feat: now writes all datasets to db

This commit is contained in:
Leonard Excoffier
2024-08-31 15:52:37 -04:00
parent 015879ef4d
commit 00a59b27ce
2 changed files with 71 additions and 35 deletions

View File

@@ -1,32 +1,8 @@
-- @block
-- Create tables
CREATE TABLE num (
adsh VARCHAR(255),
tag VARCHAR(255),
version VARCHAR(255),
coreg VARCHAR(255),
ddate BIGINT,
qtrs BIGINT,
uom VARCHAR(50),
value DOUBLE PRECISION,
footnote TEXT
);
CREATE TABLE pre (
adsh VARCHAR(255),
report BIGINT,
line BIGINT,
stmt VARCHAR(255),
inpth BIGINT,
rfile VARCHAR(255),
tag VARCHAR(255),
version VARCHAR(255),
plabel VARCHAR(255),
negating BIGINT
);
-- Create 'sub' table
CREATE TABLE sub (
adsh VARCHAR(255),
adsh VARCHAR(255) PRIMARY KEY,
cik BIGINT,
name VARCHAR(255),
sic DOUBLE PRECISION,
@@ -64,6 +40,7 @@ CREATE TABLE sub (
aciks VARCHAR(255)
);
-- Create 'tag' table
CREATE TABLE tag (
tag VARCHAR(255),
version VARCHAR(255),
@@ -73,7 +50,41 @@ CREATE TABLE tag (
iord VARCHAR(50),
crdr VARCHAR(50),
tlabel VARCHAR(255),
doc TEXT
doc TEXT,
PRIMARY KEY (tag, version)
);
-- Create 'num' table
CREATE TABLE num (
adsh VARCHAR(255),
tag VARCHAR(255),
version VARCHAR(255),
coreg VARCHAR(255),
ddate BIGINT,
qtrs BIGINT,
uom VARCHAR(50),
value DOUBLE PRECISION,
footnote TEXT,
PRIMARY KEY (adsh, tag, version, ddate),
FOREIGN KEY (adsh) REFERENCES sub(adsh),
FOREIGN KEY (tag, version) REFERENCES tag(tag, version)
);
-- Create 'pre' table
CREATE TABLE pre (
adsh VARCHAR(255),
report BIGINT,
line BIGINT,
stmt VARCHAR(255),
inpth BIGINT,
rfile VARCHAR(255),
tag VARCHAR(255),
version VARCHAR(255),
plabel VARCHAR(255),
negating BIGINT,
PRIMARY KEY (adsh, report, line),
FOREIGN KEY (adsh) REFERENCES sub(adsh),
FOREIGN KEY (tag, version) REFERENCES tag(tag, version)
);
-- @end

View File

@@ -1,15 +1,34 @@
import os
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
# Define a list of file paths for easy modification
# Load environment variables from .env file
load_dotenv()
# Get DB connection parameters from environment
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_HOST = os.getenv('DB_HOST')
DB_PORT = os.getenv('DB_PORT')
DB_NAME = os.getenv('DB_NAME')
# Create a connection string
connection_string = f"mariadb+pymysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
# Create the SQLAlchemy engine
engine = create_engine(connection_string)
# Define a list of file paths and corresponding table names
file_paths = [
'sec_data/2024q1/num.txt',
'sec_data/2024q1/pre.txt',
'sec_data/2024q1/sub.txt',
'sec_data/2024q1/tag.txt'
('sec_data/2024q1/num.txt', 'num'),
('sec_data/2024q1/pre.txt', 'pre'),
('sec_data/2024q1/sub.txt', 'sub'),
('sec_data/2024q1/tag.txt', 'tag')
]
# Loop through each file and perform analysis
for i, file_path in enumerate(file_paths):
# Loop through each file and write the data to the database
for i, (file_path, table_name) in enumerate(file_paths):
print(f"\nAnalyzing {file_path} (File {i+1}/4)...")
# Read the data into a Pandas DataFrame
@@ -27,3 +46,9 @@ for i, file_path in enumerate(file_paths):
missing_values = df.isnull().sum()
print("\nMissing Values:")
print(missing_values)
# Write the DataFrame to the corresponding table in the MariaDB database
df.to_sql(table_name, con=engine, if_exists='replace', index=False)
print(f"\nData from {file_path} written to the '{table_name}' table in the database.")
print("\nAll files have been processed and written to the database.")