diff --git a/.gitignore b/.gitignore
index f41b9c8..87208dc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,5 @@ CIK*
 stockdb.session.sql
 temp.json
 dataset/
-2024q2.zip
\ No newline at end of file
+2024q2.zip
+num.txt
\ No newline at end of file
diff --git a/fs_datasets.py b/fs_datasets.py
index 8a588ec..bf88a09 100644
--- a/fs_datasets.py
+++ b/fs_datasets.py
@@ -1,6 +1,7 @@
 import requests
 import zipfile
 import os
+import pandas as pd
 
 def download_file(url, filename):
     headers = {"User-Agent": "LeonardExcoffier/1.0 (excoffier.leonard@gmail.com)"}
@@ -18,6 +19,40 @@ dataset_dir = "dataset"
 dataset_link = "https://www.sec.gov/files/dera/data/financial-statement-data-sets/2024q2.zip"
 dataset_zip = "2024q2.zip"
 
-os.makedirs(dataset_dir, exist_ok=True)
-download_file(dataset_link, dataset_zip)
-extract_zip(dataset_zip, dataset_dir)
\ No newline at end of file
+# Download and extraction are one-time steps; uncomment to refresh the data
+#os.makedirs(dataset_dir, exist_ok=True)
+#download_file(dataset_link, dataset_zip)
+#extract_zip(dataset_zip, dataset_dir)
+
+# Source file paths for each table in the dataset
+file_paths = {
+    "sub": os.path.join(dataset_dir, "sub.txt"),
+    "tag": os.path.join(dataset_dir, "tag.txt"),
+    "num": os.path.join(dataset_dir, "num.txt"),
+    "pre": os.path.join(dataset_dir, "pre.txt"),
+}
+
+# Dictionary to hold the loaded DataFrames
+dfs = {}
+
+# Primary key columns for each table
+primary_keys = {
+    "sub": ["adsh"],
+    "tag": ["tag", "version"],
+    "num": ["adsh", "tag", "version", "ddate", "qtrs", "uom"],
+    "pre": ["adsh", "report", "line"],
+}
+
+# Read each tab-separated file into a DataFrame and clean it
+for key, path in file_paths.items():
+    dfs[key] = pd.read_csv(path, sep="\t")
+
+    # Drop rows where any primary key column is empty
+    dfs[key].dropna(subset=primary_keys[key], inplace=True)
+
+    # Drop duplicate rows based on the primary key columns
+    dfs[key].drop_duplicates(subset=primary_keys[key], inplace=True)
+
+    print(f"{key.upper()} DataFrame after dropping empty primary keys and duplicates:")
+    dfs[key].info()  # .info() prints directly; wrapping it in print() would emit "None"
+    print("\n")
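
How the cleaned tables fit together (a note, not part of the patch): in the SEC Financial Statement Data Sets, sub.txt carries one metadata row per filing, num.txt carries the numeric facts, and the two join on adsh. The sketch below is one hypothetical way to use the dfs dict built in fs_datasets.py; the column names (adsh, name, form, tag, qtrs, uom, ddate, value) follow the SEC's published schema for these files, and the "Revenues" tag with the qtrs/uom filters is purely illustrative.

# Sketch: join the numeric facts to filing metadata and pull quarterly
# revenue figures. Assumes dfs was built as in fs_datasets.py above;
# the "Revenues" tag and the qtrs/uom filters are illustrative choices.
facts = dfs["num"].merge(
    dfs["sub"][["adsh", "name", "form"]],  # adsh links each fact to its filing
    on="adsh",
    how="inner",
)
quarterly_revenue = facts[
    (facts["tag"] == "Revenues")   # one common us-gaap revenue tag
    & (facts["qtrs"] == 1)         # values covering a single quarter
    & (facts["uom"] == "USD")
]
print(quarterly_revenue[["name", "ddate", "value"]].head(10))

The inner join deliberately drops any fact whose submission row was removed by the primary-key cleaning above, so every surviving row has valid filing metadata.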