"""Download and load the SEC Financial Statement Data Set (2024 Q2).

Fetches the quarterly archive from sec.gov (optional, off by default),
extracts it into ``dataset/``, then reads the four tab-separated tables
(sub, tag, num, pre) into pandas DataFrames, dropping rows whose primary
key is missing or duplicated.
"""

import os
import zipfile

import pandas as pd


def download_file(url, filename):
    """Stream *url* to *filename* on disk.

    Raises ``requests.HTTPError`` on a non-2xx response.
    """
    # Imported lazily so the rest of the module (extraction, loading) works
    # even when the third-party `requests` package is not installed.
    import requests

    # sec.gov rejects requests that lack a descriptive User-Agent header.
    headers = {"User-Agent": "LeonardExcoffier/1.0 (excoffier.leonard@gmail.com)"}
    response = requests.get(url, headers=headers, stream=True)
    response.raise_for_status()
    with open(filename, "wb") as file:
        # Stream in 8 KiB chunks to avoid holding the whole archive in memory.
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)


def extract_zip(source_filename, destination_folder):
    """Extract every member of the zip *source_filename* into *destination_folder*."""
    with zipfile.ZipFile(source_filename, "r") as zip_ref:
        zip_ref.extractall(destination_folder)


def load_datasets(dataset_dir):
    """Read the four SEC tables from *dataset_dir* and de-duplicate them.

    Returns a dict mapping table name ("sub", "tag", "num", "pre") to a
    DataFrame whose primary-key columns are all non-null and unique.
    """
    file_paths = {
        "sub": os.path.join(dataset_dir, "sub.txt"),
        "tag": os.path.join(dataset_dir, "tag.txt"),
        "num": os.path.join(dataset_dir, "num.txt"),
        "pre": os.path.join(dataset_dir, "pre.txt"),
    }
    # Primary keys for each table, per the SEC Financial Statement Data
    # Sets documentation.
    primary_keys = {
        "sub": ["adsh"],
        "tag": ["tag", "version"],
        "num": ["adsh", "tag", "version", "ddate", "qtrs", "uom"],
        "pre": ["adsh", "report", "line"],
    }
    dfs = {}
    for key, path in file_paths.items():
        # low_memory=False: these files are large with mixed-type columns;
        # reading in one pass avoids chunked dtype guessing (DtypeWarning)
        # and inconsistent column dtypes.
        df = pd.read_csv(path, sep="\t", low_memory=False)
        # Rows missing any part of the primary key are unusable, and
        # duplicate keys would break joins on the key columns.
        df = df.dropna(subset=primary_keys[key])
        df = df.drop_duplicates(subset=primary_keys[key])
        dfs[key] = df
        print(f"{key.upper()} DataFrame after dropping empty primary keys and duplicates:")
        # DataFrame.info() prints directly and returns None, so it must not
        # be wrapped in print() (that would emit a stray "None" line).
        df.info()
        print("\n")
    return dfs


if __name__ == "__main__":
    dataset_dir = "dataset"
    dataset_link = (
        "https://www.sec.gov/files/dera/data/financial-statement-data-sets/2024q2.zip"
    )
    dataset_zip = "2024q2.zip"

    # Set to True on the first run to fetch and unpack the archive.
    download_dataset = False
    if download_dataset:
        os.makedirs(dataset_dir, exist_ok=True)
        download_file(dataset_link, dataset_zip)
        extract_zip(dataset_zip, dataset_dir)

    dfs = load_datasets(dataset_dir)