feat: advancing on new script

This commit is contained in:
Leonard Excoffier
2024-09-10 22:23:14 -04:00
parent e586c2b202
commit e17a54ba3c
2 changed files with 44 additions and 4 deletions

1
.gitignore vendored
View File

@@ -7,3 +7,4 @@ stockdb.session.sql
temp.json temp.json
dataset/ dataset/
2024q2.zip 2024q2.zip
num.txt

View File

@@ -1,6 +1,7 @@
import requests import requests
import zipfile import zipfile
import os import os
import pandas as pd
def download_file(url, filename): def download_file(url, filename):
headers = {"User-Agent": "LeonardExcoffier/1.0 (excoffier.leonard@gmail.com)"} headers = {"User-Agent": "LeonardExcoffier/1.0 (excoffier.leonard@gmail.com)"}
@@ -18,6 +19,44 @@ dataset_dir = "dataset"
dataset_link = "https://www.sec.gov/files/dera/data/financial-statement-data-sets/2024q2.zip" dataset_link = "https://www.sec.gov/files/dera/data/financial-statement-data-sets/2024q2.zip"
dataset_zip = "2024q2.zip" dataset_zip = "2024q2.zip"
os.makedirs(dataset_dir, exist_ok=True) #os.makedirs(dataset_dir, exist_ok=True)
download_file(dataset_link, dataset_zip) #download_file(dataset_link, dataset_zip)
extract_zip(dataset_zip, dataset_dir) #extract_zip(dataset_zip, dataset_dir)
# Paths of the four tab-separated tables inside the dataset directory.
num_file_path = os.path.join(dataset_dir, "num.txt")
pre_file_path = os.path.join(dataset_dir, "pre.txt")
sub_file_path = os.path.join(dataset_dir, "sub.txt")
tag_file_path = os.path.join(dataset_dir, "tag.txt")

# Table name -> file path, in the order the tables are loaded.
# Reuses the path variables above so each path is defined only once.
file_paths = {
    "sub": sub_file_path,
    "tag": tag_file_path,
    "num": num_file_path,
    "pre": pre_file_path,
}

# Table name -> cleaned DataFrame, filled by the loop below.
dfs = {}

# Columns treated as the primary key of each table.
# NOTE(review): the key used for "num" may be incomplete if the data set
# distinguishes rows by additional columns (e.g. segments/coreg) -- in that
# case drop_duplicates below would discard legitimate rows. Confirm against
# the data set's readme before relying on the de-duplicated "num" table.
primary_keys = {
    "sub": ["adsh"],
    "tag": ["tag", "version"],
    "num": ["adsh", "tag", "version", "ddate", "qtrs", "uom"],
    "pre": ["adsh", "report", "line"],
}

# Read each table, then drop rows with a missing primary-key value and rows
# that duplicate an earlier row's primary key (the first occurrence is kept).
for key, path in file_paths.items():
    dfs[key] = (
        pd.read_csv(path, sep="\t")
        .dropna(subset=primary_keys[key])
        .drop_duplicates(subset=primary_keys[key])
    )

    print(f"{key.upper()} DataFrame after dropping empty primary keys and duplicates:")
    # DataFrame.info() prints its report itself and returns None; wrapping it
    # in print() would emit a stray "None" line after the report.
    dfs[key].info()
    print("\n")