61 lines
1.9 KiB
Python
61 lines
1.9 KiB
Python
import os
|
|
import zipfile
|
|
|
|
import requests
|
|
|
|
|
|
def download_file(url, filename, timeout=30):
    """
    Download a file from a URL and save it locally.

    The response body is streamed to disk in 8 KiB chunks, so the whole
    payload is never held in memory at once.

    Args:
        url: Address of the resource to fetch.
        filename: Local path the response body is written to (overwritten
            if it already exists).
        timeout: Seconds to wait for the connection and for each read
            before giving up. Passed straight to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
        OSError: If the destination file cannot be opened or written.
    """
    headers = {"User-Agent": "LeonardExcoffier/1.0 (excoffier.leonard@gmail.com)"}
    print(f"Starting download: {filename}")
    # With stream=True the connection stays open until the body is consumed
    # or the response is closed; the context manager guarantees the latter
    # even when raise_for_status() or a write fails. The timeout keeps a
    # stalled server from hanging the script forever.
    with requests.get(
        url, headers=headers, stream=True, timeout=timeout
    ) as response:
        response.raise_for_status()  # Abort on 4xx/5xx before touching disk
        with open(filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
    print(f"Download complete: {filename}")
|
|
|
|
|
|
def extract_zip(source_filename, destination_folder):
    """
    Unpack every member of a ZIP archive into the given folder.

    Args:
        source_filename: Path of the ZIP archive to read.
        destination_folder: Directory the archive members are written into.
    """
    print(f"Starting extraction: {source_filename} -> {destination_folder}")
    archive = zipfile.ZipFile(source_filename, mode="r")
    with archive:
        # Leaving the block closes the archive, whether or not extraction
        # succeeded.
        archive.extractall(path=destination_folder)
    print(f"Extraction complete: {destination_folder}")
|
|
|
|
|
|
# Root folder that holds the downloaded archives and their extracted contents
SEC_DATA_DIR = "sec_data"

# One subfolder per archive, both located under SEC_DATA_DIR
COMPANYFACTS_DIR = os.path.join(SEC_DATA_DIR, "companyfacts")
SUBMISSIONS_DIR = os.path.join(SEC_DATA_DIR, "submissions")

# Make sure both target folders exist (a no-op when they are already there)
for _data_dir in (COMPANYFACTS_DIR, SUBMISSIONS_DIR):
    os.makedirs(_data_dir, exist_ok=True)

# Remote locations of the archives
COMPANYFACTS_URL = (
    "https://www.sec.gov/Archives/edgar/daily-index/xbrl/companyfacts.zip"
)
SUBMISSIONS_URL = (
    "https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip"
)

# Local path the downloaded archive is saved to
companyfacts_zip = os.path.join(SEC_DATA_DIR, "companyfacts.zip")
# submissions_zip = os.path.join(SEC_DATA_DIR, "submissions.zip")

# Fetch the archive, then unpack it into its subfolder.
# The submissions steps are currently disabled (left commented out).
download_file(COMPANYFACTS_URL, companyfacts_zip)
# download_file(SUBMISSIONS_URL, submissions_zip)

extract_zip(companyfacts_zip, COMPANYFACTS_DIR)
# extract_zip(submissions_zip, SUBMISSIONS_DIR)
|