"""Load SEC "companyfacts" JSON files and append their facts to MariaDB.

Reads every ``./sec_data/companyfacts/*.json`` document, flattens each
(taxonomy, fact, unit, observation) combination into one row, and appends
the rows to the ``data`` table via pandas ``DataFrame.to_sql``.

Reconstructed from a git format-patch whose line breaks were lost in
extraction; runtime behavior and messages match the patched new_write.py.
"""

import glob
import json
import os

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import Date, Float, Integer, String, Text, create_engine

# Explicit SQL column types for DataFrame.to_sql, so MariaDB gets real
# DATE/TEXT/FLOAT columns instead of pandas' inferred defaults.
DTYPE_MAP = {
    'entity_cik': Integer,
    'entity_name': String(255),
    'fact_id': String(255),
    'fact_taxonomy': String(255),
    'fact_label': String(255),
    'fact_description': Text,
    'fact_unit': String(255),
    'start': Date,
    'end': Date,
    'val': Float,
    'accn': String(50),
    'fy': Integer,
    'fp': String(255),
    'form': String(255),
    'filed': Date,
    'frame': String(255),
}


def _build_rows(data):
    """Flatten one companyfacts JSON document into a list of row dicts.

    Each observation under facts -> taxonomy -> fact -> units -> unit becomes
    one row; the observation's own keys (start/end/val/accn/...) are merged
    into the row after the shared fact metadata, so they land as their own
    DataFrame columns. Returns [] when the document has no 'facts'.
    """
    cik = data.get('cik')
    entity_name = data.get('entityName')
    rows = []
    for taxonomy, fact_items in data.get('facts', {}).items():
        for fact_id, fact_data in fact_items.items():
            # Metadata shared by every observation of this fact.
            base = {
                'entity_cik': cik,
                'entity_name': entity_name,
                'fact_id': fact_id,
                'fact_taxonomy': taxonomy,
                'fact_label': fact_data.get('label'),
                'fact_description': fact_data.get('description'),
            }
            for unit, observations in fact_data.get('units', {}).items():
                for details in observations:
                    row = dict(base, fact_unit=unit)
                    row.update(details)
                    rows.append(row)
    return rows


def main():
    """Ingest every companyfacts JSON file into the ``data`` table."""
    # Pull DB credentials from .env into the environment.
    load_dotenv()

    # The pre-patch file read these five settings via os.getenv; only the
    # DB_NAME line is visible in the patch context -- presumably the other
    # four follow the same pattern (TODO confirm against the full file).
    db_connection_str = (
        f"mysql+pymysql://{os.getenv('DB_USER')}:{os.getenv('DB_PASSWORD')}"
        f"@{os.getenv('DB_HOST')}:{os.getenv('DB_PORT')}/{os.getenv('DB_NAME')}"
    )
    engine = create_engine(db_connection_str)

    json_files = glob.glob('./sec_data/companyfacts/*.json')
    file_count = len(json_files)

    try:
        # enumerate keeps the progress counter correct even when a file
        # fails to parse (the old manual counter skipped those files).
        for current_file, json_file in enumerate(json_files, start=1):
            try:
                with open(json_file) as f:
                    data = json.load(f)

                print(f"Processing file {current_file}/{file_count}: {json_file}")

                if not data.get('facts'):
                    print(f"File {json_file} has no facts to process. Skipping...")
                    continue

                rows = _build_rows(data)
                if rows:
                    df = pd.DataFrame(rows)
                    # NOTE(review): 'append' means rerunning the script
                    # duplicates rows already in the table -- truncate the
                    # table or dedupe upstream before a full reload.
                    df.to_sql('data', con=engine, if_exists='append',
                              index=False, dtype=DTYPE_MAP)
                else:
                    print(f"No data rows were created for file {json_file}. Skipping insert...")

            except json.JSONDecodeError as e:
                print(f"Failed to process file {json_file}: Invalid JSON format. Error: {e}")

            except Exception as e:
                # Best-effort batch run: report the failure and keep going.
                print(f"An error occurred while processing file {json_file}: {e}")
    finally:
        # Release pooled DB connections (the original never disposed them).
        engine.dispose()

    print("Processing complete. All files have been handled.")


if __name__ == '__main__':
    main()