feat: script cycle works

This commit is contained in:
Leonard Excoffier
2024-08-31 00:38:50 -04:00
parent 765272f896
commit 9c5afa7670

View File

@@ -1,9 +1,9 @@
"""Load SEC 'companyfacts' JSON files into a MariaDB/MySQL `data` table."""

import glob
import json
import os

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import Date, Float, Integer, String, Text, create_engine

# Load environment variables (the DB_* connection settings read below)
# from a local .env file.
load_dotenv()
@@ -19,60 +19,96 @@ DB_NAME = os.getenv('DB_NAME')
# Build the SQLAlchemy engine for the target MariaDB/MySQL database.
# The DB_* values are read from the environment above (via load_dotenv).
db_connection_str = f'mysql+pymysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
engine = create_engine(db_connection_str)

# Collect every SEC companyfacts JSON file to load. sorted() makes the
# processing order (and the progress numbering) deterministic; bare
# glob.glob returns files in arbitrary OS order.
json_files = sorted(glob.glob('./sec_data/companyfacts/*.json'))
file_count = len(json_files)

# Explicit SQL column types for DataFrame.to_sql. Without this, pandas
# guesses a type for every column; declaring Date/Float/Integer here
# gives the DB properly typed, filterable columns.
dtype_map = {
    'entity_cik': Integer,
    'entity_name': String(255),
    'fact_id': String(255),
    'fact_taxonomy': String(255),
    'fact_label': String(255),
    'fact_description': Text,
    'fact_unit': String(255),
    'start': Date,
    'end': Date,
    'val': Float,
    'accn': String(50),
    'fy': Integer,
    'fp': String(255),
    'form': String(255),
    'filed': Date,
    'frame': String(255)
}

# enumerate() replaces the hand-maintained counter. The old counter was
# only advanced after a successful json.load, so a file that failed to
# parse made the "N/total" numbering skip a value; printing the progress
# line before parsing also ensures failed files are still announced.
for current_file, json_file in enumerate(json_files, start=1):
    print(f"Processing file {current_file}/{file_count}: {json_file}")
    try:
        # Load one companyfacts document.
        with open(json_file) as f:
            data = json.load(f)

        # Top-level fields of an SEC companyfacts document.
        cik = data.get('cik')
        entity_name = data.get('entityName')
        facts = data.get('facts', {})

        # Nothing to flatten for this entity — move on.
        if not facts:
            print(f"File {json_file} has no facts to process. Skipping...")
            continue

        # Flatten facts into one row per
        # (taxonomy, fact, unit, observation) combination.
        rows = []
        for taxonomy, fact_items in facts.items():
            for fact_id, fact_data in fact_items.items():
                label = fact_data.get('label')
                description = fact_data.get('description')
                units = fact_data.get('units', {})
                for unit, details_list in units.items():
                    for details in details_list:
                        row = {
                            'entity_cik': cik,
                            'entity_name': entity_name,
                            'fact_id': fact_id,
                            'fact_taxonomy': taxonomy,
                            'fact_label': label,
                            'fact_description': description,
                            'fact_unit': unit
                        }
                        # Merge the per-observation fields (start, end,
                        # val, accn, fy, fp, form, filed, frame, ...).
                        row.update(details)
                        rows.append(row)

        if rows:
            # Append so each file accumulates into the shared 'data'
            # table. NOTE(review): re-running the script re-inserts the
            # same rows — add a unique key upstream if that matters.
            df = pd.DataFrame(rows)
            df.to_sql('data', con=engine, if_exists='append', index=False, dtype=dtype_map)
        else:
            print(f"No data rows were created for file {json_file}. Skipping insert...")
    except json.JSONDecodeError as e:
        print(f"Failed to process file {json_file}: Invalid JSON format. Error: {e}")
    except Exception as e:
        # Per-file best-effort boundary: report and continue with the
        # next file rather than aborting the whole batch.
        print(f"An error occurred while processing file {json_file}: {e}")

print("Processing complete. All files have been handled.")