feat: advancing on new script
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -7,3 +7,4 @@ stockdb.session.sql
|
|||||||
temp.json
|
temp.json
|
||||||
dataset/
|
dataset/
|
||||||
2024q2.zip
|
2024q2.zip
|
||||||
|
num.txt
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
import requests
|
import requests
|
||||||
import zipfile
|
import zipfile
|
||||||
import os
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
def download_file(url, filename):
|
def download_file(url, filename):
|
||||||
headers = {"User-Agent": "LeonardExcoffier/1.0 (excoffier.leonard@gmail.com)"}
|
headers = {"User-Agent": "LeonardExcoffier/1.0 (excoffier.leonard@gmail.com)"}
|
||||||
@@ -18,6 +19,44 @@ dataset_dir = "dataset"
|
|||||||
dataset_link = "https://www.sec.gov/files/dera/data/financial-statement-data-sets/2024q2.zip"
|
dataset_link = "https://www.sec.gov/files/dera/data/financial-statement-data-sets/2024q2.zip"
|
||||||
dataset_zip = "2024q2.zip"
|
dataset_zip = "2024q2.zip"
|
||||||
|
|
||||||
os.makedirs(dataset_dir, exist_ok=True)
|
#os.makedirs(dataset_dir, exist_ok=True)
|
||||||
download_file(dataset_link, dataset_zip)
|
#download_file(dataset_link, dataset_zip)
|
||||||
extract_zip(dataset_zip, dataset_dir)
|
#extract_zip(dataset_zip, dataset_dir)
|
||||||
|
|
||||||
|
num_file_path = os.path.join(dataset_dir, "num.txt")
|
||||||
|
pre_file_path = os.path.join(dataset_dir, "pre.txt")
|
||||||
|
sub_file_path = os.path.join(dataset_dir, "sub.txt")
|
||||||
|
tag_file_path = os.path.join(dataset_dir, "tag.txt")
|
||||||
|
|
||||||
|
# Mapping from table name to the path of its tab-separated data file
|
||||||
|
file_paths = {
|
||||||
|
"sub": os.path.join(dataset_dir, "sub.txt"),
|
||||||
|
"tag": os.path.join(dataset_dir, "tag.txt"),
|
||||||
|
"num": os.path.join(dataset_dir, "num.txt"),
|
||||||
|
"pre": os.path.join(dataset_dir, "pre.txt"),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Dictionary to hold DataFrames
|
||||||
|
dfs = {}
|
||||||
|
|
||||||
|
# Primary keys for each DataFrame
|
||||||
|
primary_keys = {
|
||||||
|
"sub": ["adsh"],
|
||||||
|
"tag": ["tag", "version"],
|
||||||
|
"num": ["adsh", "tag", "version", "ddate", "qtrs", "uom"],
|
||||||
|
"pre": ["adsh", "report", "line"],
|
||||||
|
}
|
||||||
|
|
||||||
|
# Loop to read each file and create a DataFrame
|
||||||
|
for key, path in file_paths.items():
|
||||||
|
dfs[key] = pd.read_csv(path, sep='\t')
|
||||||
|
|
||||||
|
# Drop rows where primary key columns are empty
|
||||||
|
dfs[key].dropna(subset=primary_keys[key], inplace=True)
|
||||||
|
|
||||||
|
# Drop duplicate rows based on primary key columns
|
||||||
|
dfs[key].drop_duplicates(subset=primary_keys[key], inplace=True)
|
||||||
|
|
||||||
|
print(f"{key.upper()} DataFrame after dropping empty primary keys and duplicates:")
|
||||||
|
print(dfs[key].info())
|
||||||
|
print("\n")
|
||||||
|
|||||||
Reference in New Issue
Block a user