feat: now writes all datasets to db

This commit is contained in:
Leonard Excoffier
2024-08-31 15:52:37 -04:00
parent 015879ef4d
commit 00a59b27ce
2 changed files with 71 additions and 35 deletions

View File

@@ -1,32 +1,8 @@
-- @block -- @block
-- Create tables -- Create tables
CREATE TABLE num ( -- Create 'sub' table
adsh VARCHAR(255),
tag VARCHAR(255),
version VARCHAR(255),
coreg VARCHAR(255),
ddate BIGINT,
qtrs BIGINT,
uom VARCHAR(50),
value DOUBLE PRECISION,
footnote TEXT
);
CREATE TABLE pre (
adsh VARCHAR(255),
report BIGINT,
line BIGINT,
stmt VARCHAR(255),
inpth BIGINT,
rfile VARCHAR(255),
tag VARCHAR(255),
version VARCHAR(255),
plabel VARCHAR(255),
negating BIGINT
);
CREATE TABLE sub ( CREATE TABLE sub (
adsh VARCHAR(255), adsh VARCHAR(255) PRIMARY KEY,
cik BIGINT, cik BIGINT,
name VARCHAR(255), name VARCHAR(255),
sic DOUBLE PRECISION, sic DOUBLE PRECISION,
@@ -64,6 +40,7 @@ CREATE TABLE sub (
aciks VARCHAR(255) aciks VARCHAR(255)
); );
-- Create 'tag' table
CREATE TABLE tag ( CREATE TABLE tag (
tag VARCHAR(255), tag VARCHAR(255),
version VARCHAR(255), version VARCHAR(255),
@@ -73,7 +50,41 @@ CREATE TABLE tag (
iord VARCHAR(50), iord VARCHAR(50),
crdr VARCHAR(50), crdr VARCHAR(50),
tlabel VARCHAR(255), tlabel VARCHAR(255),
doc TEXT doc TEXT,
PRIMARY KEY (tag, version)
);
-- Create 'num' table
CREATE TABLE num (
adsh VARCHAR(255),
tag VARCHAR(255),
version VARCHAR(255),
coreg VARCHAR(255),
ddate BIGINT,
qtrs BIGINT,
uom VARCHAR(50),
value DOUBLE PRECISION,
footnote TEXT,
PRIMARY KEY (adsh, tag, version, ddate),
FOREIGN KEY (adsh) REFERENCES sub(adsh),
FOREIGN KEY (tag, version) REFERENCES tag(tag, version)
);
-- Create 'pre' table
CREATE TABLE pre (
adsh VARCHAR(255),
report BIGINT,
line BIGINT,
stmt VARCHAR(255),
inpth BIGINT,
rfile VARCHAR(255),
tag VARCHAR(255),
version VARCHAR(255),
plabel VARCHAR(255),
negating BIGINT,
PRIMARY KEY (adsh, report, line),
FOREIGN KEY (adsh) REFERENCES sub(adsh),
FOREIGN KEY (tag, version) REFERENCES tag(tag, version)
); );
-- @end -- @end

View File

@@ -1,15 +1,34 @@
import os
import pandas as pd import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
# Define a list of file paths for easy modification # Load environment variables from .env file
load_dotenv()
# Get DB connection parameters from environment
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_HOST = os.getenv('DB_HOST')
DB_PORT = os.getenv('DB_PORT')
DB_NAME = os.getenv('DB_NAME')
# Create a connection string
connection_string = f"mariadb+pymysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
# Create the SQLAlchemy engine
engine = create_engine(connection_string)
# Define a list of file paths and corresponding table names
file_paths = [ file_paths = [
'sec_data/2024q1/num.txt', ('sec_data/2024q1/num.txt', 'num'),
'sec_data/2024q1/pre.txt', ('sec_data/2024q1/pre.txt', 'pre'),
'sec_data/2024q1/sub.txt', ('sec_data/2024q1/sub.txt', 'sub'),
'sec_data/2024q1/tag.txt' ('sec_data/2024q1/tag.txt', 'tag')
] ]
# Loop through each file and perform analysis # Loop through each file and write the data to the database
for i, file_path in enumerate(file_paths): for i, (file_path, table_name) in enumerate(file_paths):
print(f"\nAnalyzing {file_path} (File {i+1}/4)...") print(f"\nAnalyzing {file_path} (File {i+1}/4)...")
# Read the data into a Pandas DataFrame # Read the data into a Pandas DataFrame
@@ -27,3 +46,9 @@ for i, file_path in enumerate(file_paths):
missing_values = df.isnull().sum() missing_values = df.isnull().sum()
print("\nMissing Values:") print("\nMissing Values:")
print(missing_values) print(missing_values)
# Write the DataFrame to the corresponding table in the MariaDB database
df.to_sql(table_name, con=engine, if_exists='replace', index=False)
print(f"\nData from {file_path} written to the '{table_name}' table in the database.")
print("\nAll files have been processed and written to the database.")