feat: now writes all datasets to db

2024-08-31 15:52:37 -04:00
parent 015879ef4d
commit 00a59b27ce
2 changed files with 71 additions and 35 deletions
@@ -1,32 +1,8 @@
 -- @block
 -- Create tables
-CREATE TABLE num (
-    adsh      VARCHAR(255),
-    tag       VARCHAR(255),
-    version   VARCHAR(255),
-    coreg     VARCHAR(255),
-    ddate     BIGINT,
-    qtrs      BIGINT,
-    uom       VARCHAR(50),
-    value     DOUBLE PRECISION,
-    footnote  TEXT      
-);
-
-CREATE TABLE pre (
-    adsh      VARCHAR(255),
-    report    BIGINT,
-    line      BIGINT,
-    stmt      VARCHAR(255),
-    inpth     BIGINT,
-    rfile     VARCHAR(255),
-    tag       VARCHAR(255),
-    version   VARCHAR(255),
-    plabel    VARCHAR(255),
-    negating  BIGINT
-);
-
+-- Create 'sub' table
 CREATE TABLE sub (
-    adsh        VARCHAR(255),
+    adsh        VARCHAR(255) PRIMARY KEY,
    cik         BIGINT,
    name        VARCHAR(255),
    sic         DOUBLE PRECISION,
@@ -64,6 +40,7 @@ CREATE TABLE sub (
    aciks       VARCHAR(255)
 );

+-- Create 'tag' table
 CREATE TABLE tag (
    tag       VARCHAR(255),
    version   VARCHAR(255),
@@ -73,7 +50,41 @@ CREATE TABLE tag (
    iord      VARCHAR(50),
    crdr      VARCHAR(50),
    tlabel    VARCHAR(255),
-    doc       TEXT
+    doc       TEXT,
+    PRIMARY KEY (tag, version)
+);
+
+-- Create 'num' table
+-- Each row carries a numeric `value` (plus an optional `footnote`) and is
+-- keyed by (adsh, tag, version, ddate). `adsh` must exist in 'sub' and
+-- (tag, version) must exist in 'tag'.
+-- NOTE(review): the key leaves out qtrs, uom and coreg. If the source data
+-- can hold several rows that share (adsh, tag, version, ddate) and differ
+-- only in those columns, inserts will fail with duplicate-key errors.
+-- Confirm the natural key against the dataset documentation.
+CREATE TABLE num (
+    adsh      VARCHAR(255),
+    tag       VARCHAR(255),
+    version   VARCHAR(255),
+    coreg     VARCHAR(255),
+    ddate     BIGINT,
+    qtrs      BIGINT,
+    uom       VARCHAR(50),
+    value     DOUBLE PRECISION,
+    footnote  TEXT,
+    PRIMARY KEY (adsh, tag, version, ddate),
+    FOREIGN KEY (adsh) REFERENCES sub(adsh),
+    FOREIGN KEY (tag, version) REFERENCES tag(tag, version)
+);
+
+-- Create 'pre' table
+-- Rows are keyed by (adsh, report, line). `adsh` must exist in 'sub' and
+-- (tag, version) must exist in 'tag', so both referenced tables have to be
+-- created (and populated) before rows can be inserted here.
+CREATE TABLE pre (
+    adsh      VARCHAR(255),
+    report    BIGINT,
+    line      BIGINT,
+    stmt      VARCHAR(255),
+    inpth     BIGINT,
+    rfile     VARCHAR(255),
+    tag       VARCHAR(255),
+    version   VARCHAR(255),
+    plabel    VARCHAR(255),
+    negating  BIGINT,
+    PRIMARY KEY (adsh, report, line),
+    FOREIGN KEY (adsh) REFERENCES sub(adsh),
+    FOREIGN KEY (tag, version) REFERENCES tag(tag, version)
 );
 -- @end

@@ -1,15 +1,34 @@
+import os
+from urllib.parse import quote
+
+from dotenv import load_dotenv
 import pandas as pd
+from sqlalchemy import create_engine

-# Define a list of file paths for easy modification
+# Load environment variables from .env file
+load_dotenv()
+
+# Get DB connection parameters from environment
+DB_USER = os.getenv('DB_USER')
+DB_PASSWORD = os.getenv('DB_PASSWORD')
+DB_HOST = os.getenv('DB_HOST')
+DB_PORT = os.getenv('DB_PORT')
+DB_NAME = os.getenv('DB_NAME')
+
+# Fail early with a readable message: an unset variable would otherwise be
+# interpolated into the URL as the literal text "None".
+_missing_vars = [
+    var_name
+    for var_name, var_value in (
+        ('DB_USER', DB_USER),
+        ('DB_PASSWORD', DB_PASSWORD),
+        ('DB_HOST', DB_HOST),
+        ('DB_PORT', DB_PORT),
+        ('DB_NAME', DB_NAME),
+    )
+    if var_value is None
+]
+if _missing_vars:
+    raise RuntimeError(
+        f"Missing required environment variable(s): {', '.join(_missing_vars)}"
+    )
+
+# Create a connection string. The user name and password are percent-encoded
+# so that characters such as '@', ':' or '/' inside them cannot be mistaken
+# for URL delimiters when SQLAlchemy parses the string.
+connection_string = (
+    f"mariadb+pymysql://{quote(DB_USER, safe='')}:{quote(DB_PASSWORD, safe='')}"
+    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
+)
+
+# Create the SQLAlchemy engine
+engine = create_engine(connection_string)
+
+# Define a list of file paths and corresponding table names
+# Each entry is a (path to the tab-delimited source file, target table) pair.
+# NOTE(review): this order loads 'num' and 'pre' before 'sub' and 'tag',
+# i.e. before the tables that the SQL schema in this commit declares as
+# their foreign-key parents. Confirm the order is intended.
 file_paths = [
-    'sec_data/2024q1/num.txt',
-    'sec_data/2024q1/pre.txt',
-    'sec_data/2024q1/sub.txt',
-    'sec_data/2024q1/tag.txt'
+    ('sec_data/2024q1/num.txt', 'num'),
+    ('sec_data/2024q1/pre.txt', 'pre'),
+    ('sec_data/2024q1/sub.txt', 'sub'),
+    ('sec_data/2024q1/tag.txt', 'tag')
 ]

-# Loop through each file and perform analysis
-for i, file_path in enumerate(file_paths):
+# Loop through each file and write the data to the database
+# NOTE(review): the progress message below hard-codes a total of 4 files; it
+# will be wrong if file_paths grows or shrinks.
+for i, (file_path, table_name) in enumerate(file_paths):
    print(f"\nAnalyzing {file_path} (File {i+1}/4)...")
    
    # Read the data into a Pandas DataFrame
@@ -27,3 +46,9 @@ for i, file_path in enumerate(file_paths):
    # Count the null entries in each column of the current DataFrame
    missing_values = df.isnull().sum()
    print("\nMissing Values:")
    print(missing_values)
+    
+    # Write the DataFrame to the corresponding table in the MariaDB database
+    # NOTE(review): per the pandas documentation, if_exists='replace' drops an
+    # existing table and recreates it from the DataFrame's inferred dtypes, so
+    # any primary/foreign keys declared by a hand-written schema would be lost.
+    # Confirm whether 'append' into the pre-created tables is what is wanted.
+    df.to_sql(table_name, con=engine, if_exists='replace', index=False)
+    print(f"\nData from {file_path} written to the '{table_name}' table in the database.")
+
+print("\nAll files have been processed and written to the database.")