refactor: complete rework; stop extracting gigabytes of per-company JSON and load the quarterly datasets directly. Still need a way to automate the download.

Leonard Excoffier
2024-08-31 15:32:57 -04:00
parent b5e0af3ca5
commit f31d901201
3 changed files with 89 additions and 180 deletions


@@ -48,13 +48,13 @@ SUBMISSIONS_URL = (
 )
 # File paths to save the zip files
-companyfacts_zip = os.path.join(SEC_DATA_DIR, "companyfacts.zip")
-#submissions_zip = os.path.join(SEC_DATA_DIR, "submissions.zip")
+#companyfacts_zip = os.path.join(SEC_DATA_DIR, "companyfacts.zip")
+submissions_zip = os.path.join(SEC_DATA_DIR, "submissions.zip")
 # Download the files
-download_file(COMPANYFACTS_URL, companyfacts_zip)
-#download_file(SUBMISSIONS_URL, submissions_zip)
+#download_file(COMPANYFACTS_URL, companyfacts_zip)
+download_file(SUBMISSIONS_URL, submissions_zip)
 # Extract the files into respective directories
-extract_zip(companyfacts_zip, COMPANYFACTS_DIR)
-#extract_zip(submissions_zip, SUBMISSIONS_DIR)
+#extract_zip(companyfacts_zip, COMPANYFACTS_DIR)
+extract_zip(submissions_zip, SUBMISSIONS_DIR)
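
The commit message leaves the download manual. A minimal sketch of how it could be automated, assuming the requests library and the quarterly zip URL pattern on sec.gov (worth verifying, since the site layout can change); download_quarter is a hypothetical helper whose results the existing extract_zip could consume:

import os
import requests

SEC_DATA_DIR = "sec_data"  # assumed to match the script's existing constant

# Quarterly Financial Statement Data Sets URL pattern; verify against sec.gov
DATASET_URL = "https://www.sec.gov/files/dera/data/financial-statement-data-sets/{period}.zip"

def download_quarter(year, quarter):
    """Download one quarterly data set zip unless it is already on disk."""
    period = f"{year}q{quarter}"
    dest = os.path.join(SEC_DATA_DIR, f"{period}.zip")
    if os.path.exists(dest):
        return dest  # skip quarters that were already fetched
    # The SEC asks automated clients to identify themselves via User-Agent
    resp = requests.get(DATASET_URL.format(period=period),
                        headers={"User-Agent": "your-name your-email@example.com"},
                        timeout=120)
    resp.raise_for_status()
    with open(dest, "wb") as f:
        f.write(resp.content)
    return dest

if __name__ == "__main__":
    os.makedirs(SEC_DATA_DIR, exist_ok=True)
    for q in range(1, 5):
        download_quarter(2024, q)  # extract_zip(...) could follow here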


@@ -1,55 +1,75 @@
-CREATE TABLE IF NOT EXISTS entities (
-    cik INT PRIMARY KEY, -- CIK is now the primary key, ensuring uniqueness
-    name VARCHAR(255) NOT NULL -- Name of the company
+CREATE TABLE num (
+    adsh VARCHAR(255),
+    tag VARCHAR(255),
+    version VARCHAR(255),
+    coreg VARCHAR(255),
+    ddate BIGINT,
+    qtrs BIGINT,
+    uom VARCHAR(50),
+    value DOUBLE PRECISION,
+    footnote TEXT
 );
-CREATE TABLE IF NOT EXISTS facts (
-    id VARCHAR(255) PRIMARY KEY, -- Unique identifier for the fact
-    taxonomy VARCHAR(255), -- Taxonomy of the fact
-    label VARCHAR(255), -- Label of the fact
-    description TEXT, -- Description of the fact
-    unit VARCHAR(255) -- Unit of the fact
+CREATE TABLE pre (
+    adsh VARCHAR(255),
+    report BIGINT,
+    line BIGINT,
+    stmt VARCHAR(255),
+    inpth BIGINT,
+    rfile VARCHAR(255),
+    tag VARCHAR(255),
+    version VARCHAR(255),
+    plabel VARCHAR(255),
+    negating BIGINT
 );
-CREATE TABLE IF NOT EXISTS data (
-    cik INT, -- CIK of the company
-    fact_id VARCHAR(255),
-    end DATE,
-    start DATE, -- Start date of the fact
-    val INT,
-    accn VARCHAR(255),
-    fy INT,
-    fp VARCHAR(255),
-    form VARCHAR(255),
-    filed DATE,
-    frame VARCHAR(255),
-    PRIMARY KEY (cik, fact_id, end),
-    FOREIGN KEY (cik) REFERENCES entities(cik),
-    FOREIGN KEY (fact_id) REFERENCES facts(id)
+CREATE TABLE sub (
+    adsh VARCHAR(255),
+    cik BIGINT,
+    name VARCHAR(255),
+    sic DOUBLE PRECISION,
+    countryba VARCHAR(100),
+    stprba VARCHAR(100),
+    cityba VARCHAR(255),
+    zipba VARCHAR(50),
+    bas1 VARCHAR(255),
+    bas2 VARCHAR(255),
+    baph VARCHAR(255),
+    countryma VARCHAR(100),
+    stprma VARCHAR(100),
+    cityma VARCHAR(255),
+    zipma VARCHAR(50),
+    mas1 VARCHAR(255),
+    mas2 VARCHAR(255),
+    countryinc VARCHAR(100),
+    stprinc VARCHAR(100),
+    ein BIGINT,
+    former VARCHAR(255),
+    changed DOUBLE PRECISION,
+    afs VARCHAR(255),
+    wksi BIGINT,
+    fye DOUBLE PRECISION,
+    form VARCHAR(50),
+    period DOUBLE PRECISION,
+    fy DOUBLE PRECISION,
+    fp VARCHAR(50),
+    filed BIGINT,
+    accepted VARCHAR(255),
+    prevrpt BIGINT,
+    detail BIGINT,
+    instance VARCHAR(255),
+    nciks BIGINT,
+    aciks VARCHAR(255)
 );
--- @block
-CREATE TABLE IF NOT EXISTS data (
-    entity_cik INT,
-    entity_name VARCHAR(255),
-    fact_id VARCHAR(255),
-    fact_taxonomy VARCHAR(255),
-    fact_label VARCHAR(255),
-    fact_description TEXT,
-    fact_unit VARCHAR(255)
-    end DATE,
-    val FLOAT,
-    accn VARCHAR(50),
-    fy INT,
-    fp VARCHAR(255),
-    form VARCHAR(255),
-    filed DATE,
-    frame VARCHAR(255),
-    start DATE,
-    PRIMARY KEY (entity_cik, fact_id, end, form),
-)
--- @block
-CREATE TABLE IF NOT EXISTS data ();
+CREATE TABLE tag (
+    tag VARCHAR(255),
+    version VARCHAR(255),
+    custom BIGINT,
+    abstract BIGINT,
+    datatype VARCHAR(255),
+    iord VARCHAR(50),
+    crdr VARCHAR(50),
+    tlabel VARCHAR(255),
+    doc TEXT
+);
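
The four new tables mirror the four tab-separated files (sub.txt, num.txt, tag.txt, pre.txt) that each quarterly Financial Statement Data Set zip extracts to. A sketch of one way to bulk-load them, assuming pandas plus SQLAlchemy with the MariaDB connector; the connection string and the 2024q1 path are placeholders:

import pandas as pd
from sqlalchemy import create_engine

# Placeholder connection string; real credentials belong in .env
engine = create_engine("mariadb+mariadbconnector://user:password@localhost:3306/sec")

# One quarterly zip extracts to four tab-separated files that map
# one-to-one onto the sub, num, tag, and pre tables above
for table in ("sub", "num", "tag", "pre"):
    df = pd.read_csv(f"sec_data/2024q1/{table}.txt", sep="\t", low_memory=False)
    df.to_sql(table, engine, if_exists="append", index=False)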


@@ -1,129 +1,18 @@
-import os
-import mariadb
-import json
-from dotenv import load_dotenv
+import pandas as pd
-# Load environment variables from .env file
-load_dotenv()
+# Read the data into a Pandas DataFrame
+file_path = 'sec_data/2024q1/tag.txt'
+df = pd.read_csv(file_path, sep='\t')
-def connect_to_db():
-    try:
-        # Read the connection parameters from the environment
-        conn = mariadb.connect(
-            user=os.getenv("DB_USER"),
-            password=os.getenv("DB_PASSWORD"),
-            host=os.getenv("DB_HOST"),
-            port=int(os.getenv("DB_PORT")),
-            database=os.getenv("DB_NAME")
-        )
-        return conn
-    except mariadb.Error as e:
-        print(f"Error connecting to MariaDB: {e}")
-        return None
+# Inspect the DataFrame
+print("First rows of the DataFrame:")
+print(df.head(10))
-def insert_entity(cursor, cik, entity_name):
-    cursor.execute(
-        "INSERT IGNORE INTO entities (cik, name) VALUES (?, ?)", (cik, entity_name))
+# Get the DataFrame Information
+print("\nSummary Information:")
+print(df.info())
-def insert_fact(cursor, taxonomy, fact_id, label, description, unit):
-    cursor.execute(
-        "INSERT IGNORE INTO facts (id, taxonomy, label, description, unit) VALUES (?, ?, ?, ?, ?)",
-        (fact_id, taxonomy, label, description, unit)
-    )
-def insert_data(cursor, cik, fact_id, start, end, val, accn, fy, fp, form, filed, frame):
-    cursor.execute(
-        """INSERT IGNORE INTO data (cik, fact_id, end, start, val, accn, fy, fp, form, filed, frame)
-        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
-        (cik, fact_id, end, start, val, accn, fy, fp, form, filed, frame)
-    )
-def cik_exists(cursor, cik):
-    cursor.execute("SELECT 1 FROM entities WHERE cik = ?", (cik,))
-    return cursor.fetchone() is not None
-def parse_json_and_insert_data(file_path):
-    with open(file_path, 'r') as file:
-        data = json.load(file)
-    cik = data.get('cik')
-    # Start a new connection for each file
-    conn = connect_to_db()
-    if conn is None:
-        return False
-    try:
-        cursor = conn.cursor()
-        # Optional: Check if cik already exists in the database.
-        # You can comment this block out if you do not want this check.
-        if cik_exists(cursor, cik):
-            print(f"CIK {cik} already exists in the database. Skipping file {file_path}.")
-            return False
-        # Insert the entity
-        entity_name = data.get('entityName')
-        insert_entity(cursor, cik, entity_name)
-        # Iterate over facts
-        for taxonomy, fact_details in data['facts'].items():
-            for fact_id, fact in fact_details.items():
-                # Get fact details
-                label = fact.get('label')
-                description = fact.get('description')
-                for unit, unit_vals in fact.get('units', {}).items():
-                    # Insert fact
-                    insert_fact(cursor, taxonomy, fact_id, label, description, unit)
-                    # Insert each data point
-                    for entry in unit_vals:
-                        start = entry.get('start', None)
-                        end = entry['end']
-                        val = entry['val']
-                        accn = entry['accn']
-                        fy = entry['fy']
-                        fp = entry['fp']
-                        form = entry['form']
-                        filed = entry['filed']
-                        frame = entry.get('frame', None)
-                        insert_data(cursor, cik, fact_id, start, end, val, accn, fy, fp, form, filed, frame)
-        # Commit transaction
-        conn.commit()
-        return True
-    except Exception as e:
-        print(f"Error occurred while processing {file_path}: {e}")
-        conn.rollback()
-        return False
-    finally:
-        cursor.close()
-        conn.close()
-def process_all_files_in_directory(directory_path):
-    files = [f for f in os.listdir(directory_path) if f.endswith('.json')]
-    total_files = len(files)
-    processed_files = 0
-    for idx, file_name in enumerate(files, start=1):
-        file_path = os.path.join(directory_path, file_name)
-        print(f"Processing file {idx} of {total_files}: {file_name}")
-        if parse_json_and_insert_data(file_path):
-            processed_files += 1
-            print(f"Successfully processed {file_name}")
-        else:
-            print(f"Failed to process {file_name}")
-    print(f"Finished processing {processed_files} out of {total_files} files.")
-def main():
-    # Process all JSON files in the directory
-    directory_path = './sec_data/companyfacts/'
-    process_all_files_in_directory(directory_path)
-if __name__ == "__main__":
-    main()
+# Check if there are any missing values in the DataFrame
+missing_values = df.isnull().sum()
+print("\nMissing Values:")
+print(missing_values)
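
The replacement script stops at inspecting tag.txt. A possible next step, sketched under the assumption that the tag table above is the load target, reusing the mariadb driver and the qmark placeholders the old script relied on; the credentials are placeholders:

import mariadb
import pandas as pd

df = pd.read_csv("sec_data/2024q1/tag.txt", sep="\t")

# Placeholder credentials; the old script read these from .env via dotenv
conn = mariadb.connect(user="user", password="password",
                       host="localhost", port=3306, database="sec")
cursor = conn.cursor()

cols = ["tag", "version", "custom", "abstract", "datatype",
        "iord", "crdr", "tlabel", "doc"]
# NaN -> SQL NULL, numpy scalars -> native Python types the driver accepts
rows = [
    tuple(None if pd.isna(v) else (v.item() if hasattr(v, "item") else v)
          for v in row)
    for row in df[cols].itertuples(index=False, name=None)
]
cursor.executemany(
    "INSERT INTO tag (tag, version, custom, abstract, datatype, iord, crdr, tlabel, doc) "
    "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
    rows,
)
conn.commit()
conn.close()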