From 00a59b27ce84adffed573581266b92990699c9b0 Mon Sep 17 00:00:00 2001 From: Leonard Excoffier <48970393+excoffierleonard@users.noreply.github.com> Date: Sat, 31 Aug 2024 15:52:37 -0400 Subject: [PATCH] feat: now writes all datasets to db --- db_schema.sql | 65 +++++++++++++++++++++++++++++--------------------- write_to_db.py | 41 ++++++++++++++++++++++++------- 2 files changed, 71 insertions(+), 35 deletions(-) diff --git a/db_schema.sql b/db_schema.sql index 3189cb7..1c3a953 100644 --- a/db_schema.sql +++ b/db_schema.sql @@ -1,32 +1,8 @@ -- @block -- Create tables -CREATE TABLE num ( - adsh VARCHAR(255), - tag VARCHAR(255), - version VARCHAR(255), - coreg VARCHAR(255), - ddate BIGINT, - qtrs BIGINT, - uom VARCHAR(50), - value DOUBLE PRECISION, - footnote TEXT -); - -CREATE TABLE pre ( - adsh VARCHAR(255), - report BIGINT, - line BIGINT, - stmt VARCHAR(255), - inpth BIGINT, - rfile VARCHAR(255), - tag VARCHAR(255), - version VARCHAR(255), - plabel VARCHAR(255), - negating BIGINT -); - +-- Create 'sub' table CREATE TABLE sub ( - adsh VARCHAR(255), + adsh VARCHAR(255) PRIMARY KEY, cik BIGINT, name VARCHAR(255), sic DOUBLE PRECISION, @@ -64,6 +40,7 @@ CREATE TABLE sub ( aciks VARCHAR(255) ); +-- Create 'tag' table CREATE TABLE tag ( tag VARCHAR(255), version VARCHAR(255), @@ -73,7 +50,41 @@ CREATE TABLE tag ( iord VARCHAR(50), crdr VARCHAR(50), tlabel VARCHAR(255), - doc TEXT + doc TEXT, + PRIMARY KEY (tag, version) +); + +-- Create 'num' table +CREATE TABLE num ( + adsh VARCHAR(255), + tag VARCHAR(255), + version VARCHAR(255), + coreg VARCHAR(255), + ddate BIGINT, + qtrs BIGINT, + uom VARCHAR(50), + value DOUBLE PRECISION, + footnote TEXT, + PRIMARY KEY (adsh, tag, version, ddate), + FOREIGN KEY (adsh) REFERENCES sub(adsh), + FOREIGN KEY (tag, version) REFERENCES tag(tag, version) +); + +-- Create 'pre' table +CREATE TABLE pre ( + adsh VARCHAR(255), + report BIGINT, + line BIGINT, + stmt VARCHAR(255), + inpth BIGINT, + rfile VARCHAR(255), + tag VARCHAR(255), + version VARCHAR(255), + plabel VARCHAR(255), + negating BIGINT, + PRIMARY KEY (adsh, report, line), + FOREIGN KEY (adsh) REFERENCES sub(adsh), + FOREIGN KEY (tag, version) REFERENCES tag(tag, version) ); -- @end diff --git a/write_to_db.py b/write_to_db.py index 9957423..27d779b 100644 --- a/write_to_db.py +++ b/write_to_db.py @@ -1,15 +1,34 @@ +import os import pandas as pd +from sqlalchemy import create_engine +from dotenv import load_dotenv -# Define a list of file paths for easy modification +# Load environment variables from .env file +load_dotenv() + +# Get DB connection parameters from environment +DB_USER = os.getenv('DB_USER') +DB_PASSWORD = os.getenv('DB_PASSWORD') +DB_HOST = os.getenv('DB_HOST') +DB_PORT = os.getenv('DB_PORT') +DB_NAME = os.getenv('DB_NAME') + +# Create a connection string +connection_string = f"mariadb+pymysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}" + +# Create the SQLAlchemy engine +engine = create_engine(connection_string) + +# Define a list of file paths and corresponding table names file_paths = [ - 'sec_data/2024q1/num.txt', - 'sec_data/2024q1/pre.txt', - 'sec_data/2024q1/sub.txt', - 'sec_data/2024q1/tag.txt' + ('sec_data/2024q1/num.txt', 'num'), + ('sec_data/2024q1/pre.txt', 'pre'), + ('sec_data/2024q1/sub.txt', 'sub'), + ('sec_data/2024q1/tag.txt', 'tag') ] -# Loop through each file and perform analysis -for i, file_path in enumerate(file_paths): +# Loop through each file and write the data to the database +for i, (file_path, table_name) in enumerate(file_paths): print(f"\nAnalyzing {file_path} (File {i+1}/4)...") # Read the data into a Pandas DataFrame @@ -26,4 +45,10 @@ for i, file_path in enumerate(file_paths): # Check if there are any missing values in the DataFrame missing_values = df.isnull().sum() print("\nMissing Values:") - print(missing_values) \ No newline at end of file + print(missing_values) + + # Write the DataFrame to the corresponding table in the MariaDB database + df.to_sql(table_name, con=engine, if_exists='replace', index=False) + print(f"\nData from {file_path} written to the '{table_name}' table in the database.") + +print("\nAll files have been processed and written to the database.") \ No newline at end of file