-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathScraper.py
More file actions
144 lines (119 loc) · 6.29 KB
/
Scraper.py
File metadata and controls
144 lines (119 loc) · 6.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 14 23:33:36 2023
@author: apsin
"""
import zipfile
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import BytesIO
# Define the URL where the zip files are located
base_url= 'https://www.sec.gov'
url = 'https://www.sec.gov/dera/data/crowdfunding-offerings-data-sets' #dataset url
# Send an HTTP GET request to the URL
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'} # user agent was added in order to avoid 403 error
response = requests.get(url, headers=headers)
# Parse the HTML content of the page
soup = BeautifulSoup(response.text, 'html.parser')
# Find all links to zip files on the webpage
zip_links = [base_url + a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.zip')]
# Initialize an empty list to store dataframes
def zipFileToDf(zip_links, fileName):
dataframes = []
for zip_link in zip_links:
# Send an HTTP GET request to download the zip file
response = requests.get(zip_link, headers=headers)
# Check if the request was successful
if response.status_code == 200:
# Extract the zip file content
zip_data = BytesIO(response.content)
zip_file = zipfile.ZipFile(zip_data, 'r')
# Define the folder inside the zip file where the date file, with fileName is located
folder_name = zip_link[-13:-4].replace('q','Q') #the folder name is part of link string i.e. same as zip file name but with uppercase Q
# Specify the file path within the zip file
file_path_in_zip = f'{folder_name}/{fileName}'
# Check if 'FORM_C_DISCLOSURE.tsv' exists in the specified folder inside the zip file
if file_path_in_zip in zip_file.namelist():
# Extract 'FORM_C_DISCLOSURE.tsv' from the specified folder inside the zip file
tsv_data = zip_file.open(file_path_in_zip)
# Read the TSV data into a Pandas dataframe
df = pd.read_csv(tsv_data, sep='\t')
# Append the dataframe to the list
dataframes.append(df)
# Close the TSV file
tsv_data.close()
else:
print(f"'{fileName} not found in {zip_link}/{folder_name}")
# Close the zip file
zip_file.close()
else:
print(f"Failed to download {zip_link}")
# Combine all dataframes into one and return it
return pd.concat(dataframes, ignore_index=True)
df_disclosure= zipFileToDf(zip_links, 'FORM_C_DISCLOSURE.tsv')
#df_CoIssuer= zipFileToDf(zip_links, 'FORM_C_COISSUER_INFORMATION.tsv')
df_Issuer= zipFileToDf(zip_links, 'FORM_C_ISSUER_INFORMATION.tsv')
df_Submission= zipFileToDf(zip_links, 'FORM_C_SUBMISSION.tsv')
df_merged= pd.merge(df_disclosure,df_Issuer, on= 'ACCESSION_NUMBER', how='left')
df_merged= pd.merge(df_merged,df_Submission, on= 'ACCESSION_NUMBER', how='left')
columns_to_drop= ['STREET1', 'STREET2', 'COMMISSIONFILENUMBER', 'CRDNUMBER', 'PERIOD']
df_merged.drop(columns= columns_to_drop, axis=1, inplace=True)
#For renaming columns
# https://www.sec.gov/dera/data/crowdfund.pdf provides detailed overview of the column descriptions/context names contained in the scraped data
column_mapping = {
'ACCESSION_NUMBER': 'accessionNumber',
'COMPENSATIONAMOUNT': 'compensationAmount',
'FINANCIALINTEREST': 'financialInterest',
'SECURITYOFFEREDTYPE': 'securityOfferedType',
'SECURITYOFFEREDOTHERDESC': 'securityOfferedOtherDesc',
'NOOFSECURITYOFFERED': 'noOfSecurityOffered',
'PRICE': 'price',
'PRICEDETERMINATIONMETHOD': 'priceDeterminationMethod',
'OFFERINGAMOUNT': 'targetOfferingAmount',
'OVERSUBSCRIPTIONACCEPTED': 'oversubscriptionAccepted',
'OVERSUBSCRIPTIONALLOCATIONTYPE': 'oversubscriptionAllocationType',
'DESCOVERSUBSCRIPTION': 'oversubscriptionDesc',
'MAXIMUMOFFERINGAMOUNT': 'maximumOfferingAmount',
'DEADLINEDATE': 'campaignDeadlineDate',
'CURRENTEMPLOYEES': 'currentEmployees',
'TOTALASSETMOSTRECENTFISCALYEAR': 'totalAssetMostRecentFiscalYear',
'TOTALASSETPRIORFISCALYEAR': 'totalAssetPriorFiscalYear',
'CASHEQUIMOSTRECENTFISCALYEAR': 'cashEqMostRecentFiscalYear',
'CASHEQUIPRIORFISCALYEAR': 'cashEqPriorFiscalYear',
'ACTRECEIVEDRECENTFISCALYEAR': 'accountsReceivedRecentFiscalYear',
'ACTRECEIVEDPRIORFISCALYEAR': 'accountsReceivedPriorFiscalYear',
'SHORTTERMDEBTMRECENTFISCALYEAR': 'shortTermDebtMRecentFiscalYear',
'SHORTTERMDEBTPRIORFISCALYEAR': 'shortTermDebtPriorFiscalYear',
'LONGTERMDEBTRECENTFISCALYEAR': 'longTermDebtRecentFiscalYear',
'LONGTERMDEBTPRIORFISCALYEAR': 'longTermDebtPriorFiscalYear',
'REVENUEMOSTRECENTFISCALYEAR': 'revenueMostRecentFiscalYear',
'REVENUEPRIORFISCALYEAR': 'revenuePriorFiscalYear',
'COSTGOODSSOLDRECENTFISCALYEAR': 'costGoodsSoldRecentFiscalYear',
'COSTGOODSSOLDPRIORFISCALYEAR': 'costGoodsSoldPriorFiscalYear',
'TAXPAIDMOSTRECENTFISCALYEAR': 'taxPaidMostRecentFiscalYear',
'TAXPAIDPRIORFISCALYEAR': 'taxPaidPriorFiscalYear',
'NETINCOMEMOSTRECENTFISCALYEAR': 'netIncomeMostRecentFiscalYear',
'NETINCOMEPRIORFISCALYEAR': 'netIncomePriorFiscalYear',
'ISAMENDMENT': 'isAmendment',
'PROGRESSUPDATE': 'progressUpdate',
'NATUREOFAMENDMENT': 'natureOfAmendment',
'NAMEOFISSUER': 'issuerName',
'LEGALSTATUSFORM': 'legalStatusForm',
'LEGALSTATUSOTHERDESC': 'legalStatusOtherDesc',
'JURISDICTIONORGANIZATION': 'jurisdictionOrganization',
'DATEINCORPORATION': 'dateIncorporation',
'CITY': 'city',
'STATEORCOUNTRY': 'stateOrCountry',
'ZIPCODE': 'zipCode',
'ISSUERWEBSITE': 'issuerWebsite',
'COMPANYNAME': 'IntermediaryName',
'ISCOISSUER': 'isCoIssuer',
'SUBMISSION_TYPE': 'submissionType',
'FILING_DATE': 'filingDate',
'CIK': 'issuerCIK',
'FILE_NUMBER': 'fileNumber',
'COMMISSIONCIK': 'IntermediaryCIK'
}
df_merged.rename(columns=column_mapping, inplace=True)
df_merged.to_csv('scrapedCrowdOfferingsData.csv', index=False)