From e586c2b202c0cd496772528a2846decbe920c5e9 Mon Sep 17 00:00:00 2001
From: Leonard Excoffier <48970393+excoffierleonard@users.noreply.github.com>
Date: Tue, 10 Sep 2024 21:38:43 -0400
Subject: [PATCH] feat: downloads and extract the zip.

---
 .gitignore     |  4 +++-
 fs_datasets.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 1 deletion(-)
 create mode 100644 fs_datasets.py

diff --git a/.gitignore b/.gitignore
index 4d56f9a..f41b9c8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,6 @@ sec_data/
 CIK*
 .vscode
 stockdb.session.sql
-temp.json
\ No newline at end of file
+temp.json
+dataset/
+2024q2.zip
\ No newline at end of file
diff --git a/fs_datasets.py b/fs_datasets.py
new file mode 100644
index 0000000..8a588ec
--- /dev/null
+++ b/fs_datasets.py
@@ -0,0 +1,50 @@
+"""Download and extract an SEC Financial Statement Data Sets archive."""
+
+import os
+import zipfile
+
+import requests
+
+
+def download_file(url, filename, timeout=30):
+    """Stream the resource at *url* into the local file *filename*.
+
+    Args:
+        url: Address fetched with an HTTP GET request.
+        filename: Path of the file to create or overwrite.
+        timeout: Seconds to wait for the server to connect or send data
+            before giving up (not a cap on the total download time).
+
+    Raises:
+        requests.RequestException: If the request fails, times out, or the
+            server answers with an error status.
+    """
+    headers = {"User-Agent": "LeonardExcoffier/1.0 (excoffier.leonard@gmail.com)"}
+    # Without a timeout, requests can wait forever on a stalled server, and a
+    # streamed response holds its connection until it is closed; bound the
+    # wait and let the context manager release the connection.
+    with requests.get(url, headers=headers, stream=True, timeout=timeout) as response:
+        response.raise_for_status()
+        with open(filename, "wb") as file:
+            for chunk in response.iter_content(chunk_size=8192):
+                file.write(chunk)
+
+
+def extract_zip(source_filename, destination_folder):
+    """Unpack every member of a zip archive into *destination_folder*.
+
+    Args:
+        source_filename: Path of the zip archive to read.
+        destination_folder: Directory that receives the extracted files.
+    """
+    with zipfile.ZipFile(source_filename, "r") as zip_ref:
+        zip_ref.extractall(destination_folder)
+
+
+dataset_dir = "dataset"
+dataset_link = "https://www.sec.gov/files/dera/data/financial-statement-data-sets/2024q2.zip"
+dataset_zip = "2024q2.zip"
+
+os.makedirs(dataset_dir, exist_ok=True)
+download_file(dataset_link, dataset_zip)
+extract_zip(dataset_zip, dataset_dir)