This commit is contained in:
Leonard Excoffier
2024-08-31 19:50:15 -04:00
parent fa64f81cb8
commit 162106c8e0

View File

@@ -1,7 +1,9 @@
import os import os
import pandas as pd import pandas as pd
from sqlalchemy import create_engine from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy.dialects.mysql import insert
from dotenv import load_dotenv from dotenv import load_dotenv
import numpy as np
# Load environment variables from .env file # Load environment variables from .env file
load_dotenv() load_dotenv()
@@ -27,6 +29,9 @@ file_paths = [
('sec_data/2015q1/pre.txt', 'pre', ['adsh', 'report', 'line']) ('sec_data/2015q1/pre.txt', 'pre', ['adsh', 'report', 'line'])
] ]
# Initialize metadata
metadata = MetaData()
# Loop through each file and write the data to the database # Loop through each file and write the data to the database
for i, (file_path, table_name, primary_keys) in enumerate(file_paths): for i, (file_path, table_name, primary_keys) in enumerate(file_paths):
print(f"\nAnalyzing {file_path} (File {i+1}/4)...") print(f"\nAnalyzing {file_path} (File {i+1}/4)...")
@@ -44,18 +49,41 @@ for i, (file_path, table_name, primary_keys) in enumerate(file_paths):
print("\nUpdated 'coreg' column (NaN values replaced with 'nocoreg'):") print("\nUpdated 'coreg' column (NaN values replaced with 'nocoreg'):")
print(df[['coreg']].head(10)) # Display first 10 rows of the 'coreg' column for verification print(df[['coreg']].head(10)) # Display first 10 rows of the 'coreg' column for verification
# Dropping rows with any missing values in the primary keys and NOT NULL columns # Dropping rows with any missing values in the primary keys
df.dropna(subset=primary_keys, inplace=True) df.dropna(subset=primary_keys, inplace=True)
# Dropping duplicate rows based on primary keys # Dropping duplicate rows based on primary keys
df.drop_duplicates(subset=primary_keys, keep='first', inplace=True) # df.drop_duplicates(subset=primary_keys, keep='first', inplace=True)
# Replace NaN values with None to ensure compatibility with SQL NULL
df = df.replace([np.nan, np.inf, -np.inf], None)
# Get Updated Information # Get Updated Information
print("\nUpdated Information:") print("\nUpdated Information:")
print(df.info()) print(df.info())
# Write the cleaned DataFrame to the corresponding table in the MariaDB database # Reflect the already existing table from the database schema
df.to_sql(table_name, con=engine, if_exists='append', index=False) table = Table(table_name, metadata, autoload_with=engine)
print(f"\nCleaned data from {file_path} has been written to the '{table_name}' table in the database.\n")
# Perform Upsert operation for each row in the DataFrame
with engine.connect() as conn:
for row in df.itertuples(index=False):
# Create a dictionary of the row data
data = {key: getattr(row, key) for key in df.columns}
# Prepare insert statement using SQLAlchemy with MySQL-specific ON DUPLICATE KEY UPDATE
insert_stmt = insert(table).values(**data)
# Construct the `ON DUPLICATE KEY UPDATE` part
update_stmt = insert_stmt.on_duplicate_key_update(
{col.name: insert_stmt.inserted[col.name] for col in table.columns}
)
# Execute the upsert statement
conn.execute(update_stmt)
print(f"\nCleaned data from {file_path} has been written to the '{table_name}' table in the database with upsert functionality.\n")
print("\nAll files have been processed and cleaned data has been written to the database.") print("\nAll files have been processed and cleaned data has been written to the database.")
#FIXME: Foreign key constraint is missing — the us-gaap taxonomy version consistently lags (Q1 filings reference the prior year's gaap taxonomy), so the referenced rows may not exist yet. Verify against the SEC dataset's tag/version semantics before adding the constraint.