diff --git a/db_schema.sql b/db_schema.sql
index b74679b..5f17822 100644
--- a/db_schema.sql
+++ b/db_schema.sql
@@ -38,4 +38,7 @@ CREATE TABLE IF NOT EXISTS data (
     fact_label VARCHAR(255),
     fact_description TEXT,
     fact_unit VARCHAR(255)
-)
\ No newline at end of file
+)
+
+-- @block
+-- FIXME(review): removed "CREATE TABLE IF NOT EXISTS data ();" -- an empty column list is a syntax error in MariaDB, and it duplicates the 'data' table defined above
\ No newline at end of file
diff --git a/new_write.py b/new_write.py
index e0bb710..d7ab3b0 100644
--- a/new_write.py
+++ b/new_write.py
@@ -1,26 +1,40 @@
 import json
 import pandas as pd
+from sqlalchemy import create_engine
+from sqlalchemy.types import Integer, String, Float
+from dotenv import load_dotenv
+import os
 
-# Step 1: Load the JSON data
-with open('CIK0000320193.json') as f:
+# Load environment variables from .env file
+load_dotenv()
+
+# Database connection
+DB_USER = os.getenv('DB_USER')
+DB_PASSWORD = os.getenv('DB_PASSWORD')
+DB_HOST = os.getenv('DB_HOST')
+DB_PORT = os.getenv('DB_PORT')
+DB_NAME = os.getenv('DB_NAME')
+
+# Create the connection string
+db_connection_str = f'mysql+pymysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
+engine = create_engine(db_connection_str)
+
+# Load the JSON data as before
+with open('CIK0001937441.json') as f:
     data = json.load(f)
 
-# Step 2: Extract the relevant fields
 cik = data.get('cik')
 entity_name = data.get('entityName')
 facts = data.get('facts')
 
-# Prepare a list to hold the rows
 rows = []
 
-# Traverse through the facts dictionary
 for taxonomy, fact_items in facts.items():
     for fact_id, fact_data in fact_items.items():
         label = fact_data.get('label')
         description = fact_data.get('description')
         units = fact_data.get('units')
 
-        # For each unit and its details, add a new row
         for unit, details_list in units.items():
             for details in details_list:
                 row = {
@@ -32,15 +46,33 @@ for taxonomy, fact_items in facts.items():
                     'fact_description': description,
                     'fact_unit': unit
                 }
-
-                # Include the additional details in the row
                 row.update(details)
-
-                # Append the row to the rows list
                 rows.append(row)
 
-# Step 3: Create the DataFrame
+# Create DataFrame from collected rows
 df = pd.DataFrame(rows)
 
-# Step 4: Output the head of the DataFrame
-print(df.head())
\ No newline at end of file
+# Map DataFrame columns to SQL types (optional, but recommended for schema control)
+dtype_map = {
+    'entity_cik': String(10),
+    'entity_name': String(255),
+    'fact_id': String(255),
+    'fact_taxonomy': String(255),
+    'fact_label': String(255),
+    'fact_description': String(255),
+    'fact_unit': String(50),
+    'start': String(10),
+    'end': String(10),
+    'val': Float,
+    'accn': String(50),
+    'fy': Integer,
+    'fp': String(10),
+    'form': String(10),
+    'filed': String(10),
+    'frame': String(50)
+}
+
+# Write DataFrame to the 'data' table; NOTE(review): if_exists='replace' DROPS any existing table (including the schema from db_schema.sql) and recreates it from dtype_map -- use 'append' to keep the pre-defined schema
+df.to_sql('data', con=engine, if_exists='replace', index=False, dtype=dtype_map)
+
+print("Data successfully written to the database.")
\ No newline at end of file