"""Load SEC "companyfacts" JSON files and append their facts to MariaDB.

Reads every ``./sec_data/companyfacts/*.json`` document, flattens each
(taxonomy, fact, unit, observation) combination into one row, and appends
the rows to the ``data`` table via pandas ``DataFrame.to_sql``.

Reconstructed from a git format-patch whose line breaks were lost in
extraction; runtime behavior and messages match the patched new_write.py.
"""

import glob
import json
import os

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import Date, Float, Integer, String, Text, create_engine

# Explicit SQL column types for DataFrame.to_sql, so MariaDB gets real
# DATE/TEXT/FLOAT columns instead of pandas' inferred defaults.
DTYPE_MAP = {
    'entity_cik': Integer,
    'entity_name': String(255),
    'fact_id': String(255),
    'fact_taxonomy': String(255),
    'fact_label': String(255),
    'fact_description': Text,
    'fact_unit': String(255),
    'start': Date,
    'end': Date,
    'val': Float,
    'accn': String(50),
    'fy': Integer,
    'fp': String(255),
    'form': String(255),
    'filed': Date,
    'frame': String(255),
}


def _build_rows(data):
    """Flatten one companyfacts JSON document into a list of row dicts.

    Each observation under facts -> taxonomy -> fact -> units -> unit becomes
    one row; the observation's own keys (start/end/val/accn/...) are merged
    into the row after the shared fact metadata, so they land as their own
    DataFrame columns. Returns [] when the document has no 'facts'.
    """
    cik = data.get('cik')
    entity_name = data.get('entityName')
    rows = []
    for taxonomy, fact_items in data.get('facts', {}).items():
        for fact_id, fact_data in fact_items.items():
            # Metadata shared by every observation of this fact.
            base = {
                'entity_cik': cik,
                'entity_name': entity_name,
                'fact_id': fact_id,
                'fact_taxonomy': taxonomy,
                'fact_label': fact_data.get('label'),
                'fact_description': fact_data.get('description'),
            }
            for unit, observations in fact_data.get('units', {}).items():
                for details in observations:
                    row = dict(base, fact_unit=unit)
                    row.update(details)
                    rows.append(row)
    return rows


def main():
    """Ingest every companyfacts JSON file into the ``data`` table."""
    # Pull DB credentials from .env into the environment.
    load_dotenv()

    # The pre-patch file read these five settings via os.getenv; only the
    # DB_NAME line is visible in the patch context -- presumably the other
    # four follow the same pattern (TODO confirm against the full file).
    db_connection_str = (
        f"mysql+pymysql://{os.getenv('DB_USER')}:{os.getenv('DB_PASSWORD')}"
        f"@{os.getenv('DB_HOST')}:{os.getenv('DB_PORT')}/{os.getenv('DB_NAME')}"
    )
    engine = create_engine(db_connection_str)

    json_files = glob.glob('./sec_data/companyfacts/*.json')
    file_count = len(json_files)

    try:
        # enumerate keeps the progress counter correct even when a file
        # fails to parse (the old manual counter skipped those files).
        for current_file, json_file in enumerate(json_files, start=1):
            try:
                with open(json_file) as f:
                    data = json.load(f)

                print(f"Processing file {current_file}/{file_count}: {json_file}")

                if not data.get('facts'):
                    print(f"File {json_file} has no facts to process. Skipping...")
                    continue

                rows = _build_rows(data)
                if rows:
                    df = pd.DataFrame(rows)
                    # NOTE(review): 'append' means rerunning the script
                    # duplicates rows already in the table -- truncate the
                    # table or dedupe upstream before a full reload.
                    df.to_sql('data', con=engine, if_exists='append',
                              index=False, dtype=DTYPE_MAP)
                else:
                    print(f"No data rows were created for file {json_file}. Skipping insert...")

            except json.JSONDecodeError as e:
                print(f"Failed to process file {json_file}: Invalid JSON format. Error: {e}")

            except Exception as e:
                # Best-effort batch run: report the failure and keep going.
                print(f"An error occurred while processing file {json_file}: {e}")
    finally:
        # Release pooled DB connections (the original never disposed them).
        engine.dispose()

    print("Processing complete. All files have been handled.")


if __name__ == '__main__':
    main()