refactor: complete rework; stop extracting gigabytes of per-company JSON and load the quarterly datasets directly. Still need a way to automate the download.

Leonard Excoffier
2024-08-31 15:32:57 -04:00
parent b5e0af3ca5
commit f31d901201
3 changed files with 89 additions and 180 deletions


@@ -48,13 +48,13 @@ SUBMISSIONS_URL = (
 )
 # File paths to save the zip files
-companyfacts_zip = os.path.join(SEC_DATA_DIR, "companyfacts.zip")
-#submissions_zip = os.path.join(SEC_DATA_DIR, "submissions.zip")
+#companyfacts_zip = os.path.join(SEC_DATA_DIR, "companyfacts.zip")
+submissions_zip = os.path.join(SEC_DATA_DIR, "submissions.zip")
 # Download the files
-download_file(COMPANYFACTS_URL, companyfacts_zip)
-#download_file(SUBMISSIONS_URL, submissions_zip)
+#download_file(COMPANYFACTS_URL, companyfacts_zip)
+download_file(SUBMISSIONS_URL, submissions_zip)
 # Extract the files into respective directories
-extract_zip(companyfacts_zip, COMPANYFACTS_DIR)
-#extract_zip(submissions_zip, SUBMISSIONS_DIR)
+#extract_zip(companyfacts_zip, COMPANYFACTS_DIR)
+extract_zip(submissions_zip, SUBMISSIONS_DIR)
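
The commit message leaves the download manual. A minimal sketch of how it could be automated, assuming the requests library and the quarterly zip URL pattern on sec.gov (worth verifying, since the site layout can change); download_quarter is a hypothetical helper whose results the existing extract_zip could consume:

import os
import requests

SEC_DATA_DIR = "sec_data"  # assumed to match the script's existing constant

# Quarterly Financial Statement Data Sets URL pattern; verify against sec.gov
DATASET_URL = "https://www.sec.gov/files/dera/data/financial-statement-data-sets/{period}.zip"

def download_quarter(year, quarter):
    """Download one quarterly data set zip unless it is already on disk."""
    period = f"{year}q{quarter}"
    dest = os.path.join(SEC_DATA_DIR, f"{period}.zip")
    if os.path.exists(dest):
        return dest  # skip quarters that were already fetched
    # The SEC asks automated clients to identify themselves via User-Agent
    resp = requests.get(DATASET_URL.format(period=period),
                        headers={"User-Agent": "your-name your-email@example.com"},
                        timeout=120)
    resp.raise_for_status()
    with open(dest, "wb") as f:
        f.write(resp.content)
    return dest

if __name__ == "__main__":
    os.makedirs(SEC_DATA_DIR, exist_ok=True)
    for q in range(1, 5):
        download_quarter(2024, q)  # extract_zip(...) could follow here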


@@ -1,55 +1,75 @@
-CREATE TABLE IF NOT EXISTS entities (
-    cik INT PRIMARY KEY, -- CIK is now the primary key, ensuring uniqueness
-    name VARCHAR(255) NOT NULL -- Name of the company
+CREATE TABLE num (
+    adsh VARCHAR(255),
+    tag VARCHAR(255),
+    version VARCHAR(255),
+    coreg VARCHAR(255),
+    ddate BIGINT,
+    qtrs BIGINT,
+    uom VARCHAR(50),
+    value DOUBLE PRECISION,
+    footnote TEXT
 );
-CREATE TABLE IF NOT EXISTS facts (
-    id VARCHAR(255) PRIMARY KEY, -- Unique identifier for the fact
-    taxonomy VARCHAR(255), -- Taxonomy of the fact
-    label VARCHAR(255), -- Label of the fact
-    description TEXT, -- Description of the fact
-    unit VARCHAR(255) -- Unit of the fact
+CREATE TABLE pre (
+    adsh VARCHAR(255),
+    report BIGINT,
+    line BIGINT,
+    stmt VARCHAR(255),
+    inpth BIGINT,
+    rfile VARCHAR(255),
+    tag VARCHAR(255),
+    version VARCHAR(255),
+    plabel VARCHAR(255),
+    negating BIGINT
 );
-CREATE TABLE IF NOT EXISTS data (
-    cik INT, -- CIK of the company
-    fact_id VARCHAR(255),
-    end DATE,
-    start DATE, -- Start date of the fact
-    val INT,
-    accn VARCHAR(255),
-    fy INT,
-    fp VARCHAR(255),
-    form VARCHAR(255),
-    filed DATE,
-    frame VARCHAR(255),
-    PRIMARY KEY (cik, fact_id, end),
-    FOREIGN KEY (cik) REFERENCES entities(cik),
-    FOREIGN KEY (fact_id) REFERENCES facts(id)
+CREATE TABLE sub (
+    adsh VARCHAR(255),
+    cik BIGINT,
+    name VARCHAR(255),
+    sic DOUBLE PRECISION,
+    countryba VARCHAR(100),
+    stprba VARCHAR(100),
+    cityba VARCHAR(255),
+    zipba VARCHAR(50),
+    bas1 VARCHAR(255),
+    bas2 VARCHAR(255),
+    baph VARCHAR(255),
+    countryma VARCHAR(100),
+    stprma VARCHAR(100),
+    cityma VARCHAR(255),
+    zipma VARCHAR(50),
+    mas1 VARCHAR(255),
+    mas2 VARCHAR(255),
+    countryinc VARCHAR(100),
+    stprinc VARCHAR(100),
+    ein BIGINT,
+    former VARCHAR(255),
+    changed DOUBLE PRECISION,
+    afs VARCHAR(255),
+    wksi BIGINT,
+    fye DOUBLE PRECISION,
+    form VARCHAR(50),
+    period DOUBLE PRECISION,
+    fy DOUBLE PRECISION,
+    fp VARCHAR(50),
+    filed BIGINT,
+    accepted VARCHAR(255),
+    prevrpt BIGINT,
+    detail BIGINT,
+    instance VARCHAR(255),
+    nciks BIGINT,
+    aciks VARCHAR(255)
 );
--- @block
-CREATE TABLE IF NOT EXISTS data (
-    entity_cik INT,
-    entity_name VARCHAR(255),
-    fact_id VARCHAR(255),
-    fact_taxonomy VARCHAR(255),
-    fact_label VARCHAR(255),
-    fact_description TEXT,
-    fact_unit VARCHAR(255)
-    end DATE,
-    val FLOAT,
-    accn VARCHAR(50),
-    fy INT,
-    fp VARCHAR(255),
-    form VARCHAR(255),
-    filed DATE,
-    frame VARCHAR(255),
-    start DATE,
-    PRIMARY KEY (entity_cik, fact_id, end, form),
-)
--- @block
-CREATE TABLE IF NOT EXISTS data ();
+CREATE TABLE tag (
+    tag VARCHAR(255),
+    version VARCHAR(255),
+    custom BIGINT,
+    abstract BIGINT,
+    datatype VARCHAR(255),
+    iord VARCHAR(50),
+    crdr VARCHAR(50),
+    tlabel VARCHAR(255),
+    doc TEXT
+);
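
The four new tables mirror the four tab-separated files (sub.txt, num.txt, tag.txt, pre.txt) that each quarterly Financial Statement Data Set zip extracts to. A sketch of one way to bulk-load them, assuming pandas plus SQLAlchemy with the MariaDB connector; the connection string and the 2024q1 path are placeholders:

import pandas as pd
from sqlalchemy import create_engine

# Placeholder connection string; real credentials belong in .env
engine = create_engine("mariadb+mariadbconnector://user:password@localhost:3306/sec")

# One quarterly zip extracts to four tab-separated files that map
# one-to-one onto the sub, num, tag, and pre tables above
for table in ("sub", "num", "tag", "pre"):
    df = pd.read_csv(f"sec_data/2024q1/{table}.txt", sep="\t", low_memory=False)
    df.to_sql(table, engine, if_exists="append", index=False)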


@@ -1,129 +1,18 @@
-import os
-import mariadb
-import json
-from dotenv import load_dotenv
+import pandas as pd
-# Load environment variables from .env file
-load_dotenv()
+# Read the data into a Pandas DataFrame
+file_path = 'sec_data/2024q1/tag.txt'
+df = pd.read_csv(file_path, sep='\t')
-def connect_to_db():
-    try:
-        # Read the connection parameters from the environment
-        conn = mariadb.connect(
-            user=os.getenv("DB_USER"),
-            password=os.getenv("DB_PASSWORD"),
-            host=os.getenv("DB_HOST"),
-            port=int(os.getenv("DB_PORT")),
-            database=os.getenv("DB_NAME")
-        )
-        return conn
-    except mariadb.Error as e:
-        print(f"Error connecting to MariaDB: {e}")
-        return None
+# Inspect the DataFrame
+print("First rows of the DataFrame:")
+print(df.head(10))
-def insert_entity(cursor, cik, entity_name):
-    cursor.execute(
-        "INSERT IGNORE INTO entities (cik, name) VALUES (?, ?)", (cik, entity_name))
+# Get the DataFrame Information
+print("\nSummary Information:")
+print(df.info())
-def insert_fact(cursor, taxonomy, fact_id, label, description, unit):
-    cursor.execute(
-        "INSERT IGNORE INTO facts (id, taxonomy, label, description, unit) VALUES (?, ?, ?, ?, ?)",
-        (fact_id, taxonomy, label, description, unit)
-    )
-def insert_data(cursor, cik, fact_id, start, end, val, accn, fy, fp, form, filed, frame):
-    cursor.execute(
-        """INSERT IGNORE INTO data (cik, fact_id, end, start, val, accn, fy, fp, form, filed, frame)
-        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
-        (cik, fact_id, end, start, val, accn, fy, fp, form, filed, frame)
-    )
-def cik_exists(cursor, cik):
-    cursor.execute("SELECT 1 FROM entities WHERE cik = ?", (cik,))
-    return cursor.fetchone() is not None
-def parse_json_and_insert_data(file_path):
-    with open(file_path, 'r') as file:
-        data = json.load(file)
-    cik = data.get('cik')
-    # Start a new connection for each file
-    conn = connect_to_db()
-    if conn is None:
-        return False
-    try:
-        cursor = conn.cursor()
-        # Optional: Check if cik already exists in the database.
-        # You can comment this block out if you do not want this check.
-        if cik_exists(cursor, cik):
-            print(f"CIK {cik} already exists in the database. Skipping file {file_path}.")
-            return False
-        # Insert the entity
-        entity_name = data.get('entityName')
-        insert_entity(cursor, cik, entity_name)
-        # Iterate over facts
-        for taxonomy, fact_details in data['facts'].items():
-            for fact_id, fact in fact_details.items():
-                # Get fact details
-                label = fact.get('label')
-                description = fact.get('description')
-                for unit, unit_vals in fact.get('units', {}).items():
-                    # Insert fact
-                    insert_fact(cursor, taxonomy, fact_id, label, description, unit)
-                    # Insert each data point
-                    for entry in unit_vals:
-                        start = entry.get('start', None)
-                        end = entry['end']
-                        val = entry['val']
-                        accn = entry['accn']
-                        fy = entry['fy']
-                        fp = entry['fp']
-                        form = entry['form']
-                        filed = entry['filed']
-                        frame = entry.get('frame', None)
-                        insert_data(cursor, cik, fact_id, start, end, val, accn, fy, fp, form, filed, frame)
-        # Commit transaction
-        conn.commit()
-        return True
-    except Exception as e:
-        print(f"Error occurred while processing {file_path}: {e}")
-        conn.rollback()
-        return False
-    finally:
-        cursor.close()
-        conn.close()
-def process_all_files_in_directory(directory_path):
-    files = [f for f in os.listdir(directory_path) if f.endswith('.json')]
-    total_files = len(files)
-    processed_files = 0
-    for idx, file_name in enumerate(files, start=1):
-        file_path = os.path.join(directory_path, file_name)
-        print(f"Processing file {idx} of {total_files}: {file_name}")
-        if parse_json_and_insert_data(file_path):
-            processed_files += 1
-            print(f"Successfully processed {file_name}")
-        else:
-            print(f"Failed to process {file_name}")
-    print(f"Finished processing {processed_files} out of {total_files} files.")
-def main():
-    # Process all JSON files in the directory
-    directory_path = './sec_data/companyfacts/'
-    process_all_files_in_directory(directory_path)
-if __name__ == "__main__":
-    main()
+# Check if there are any missing values in the DataFrame
+missing_values = df.isnull().sum()
+print("\nMissing Values:")
+print(missing_values)
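
The replacement script stops at inspecting tag.txt. A possible next step, sketched under the assumption that the tag table above is the load target, reusing the mariadb driver and the qmark placeholders the old script relied on; the credentials are placeholders:

import mariadb
import pandas as pd

df = pd.read_csv("sec_data/2024q1/tag.txt", sep="\t")

# Placeholder credentials; the old script read these from .env via dotenv
conn = mariadb.connect(user="user", password="password",
                       host="localhost", port=3306, database="sec")
cursor = conn.cursor()

cols = ["tag", "version", "custom", "abstract", "datatype",
        "iord", "crdr", "tlabel", "doc"]
# NaN -> SQL NULL, numpy scalars -> native Python types the driver accepts
rows = [
    tuple(None if pd.isna(v) else (v.item() if hasattr(v, "item") else v)
          for v in row)
    for row in df[cols].itertuples(index=False, name=None)
]
cursor.executemany(
    "INSERT INTO tag (tag, version, custom, abstract, datatype, iord, crdr, tlabel, doc) "
    "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
    rows,
)
conn.commit()
conn.close()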