Files
stockdb/write_to_db.py
2024-08-31 18:26:14 -04:00

60 lines
2.1 KiB
Python

import os
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
# Load variables from a .env file into the process environment so the
# os.getenv() lookups below can see them.
load_dotenv()

# Database connection parameters, read from the environment.
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_HOST = os.getenv('DB_HOST')
DB_PORT = os.getenv('DB_PORT')
DB_NAME = os.getenv('DB_NAME')

# Fail fast with a readable message. os.getenv() returns None for an unset
# variable, and formatting None into the URL would silently produce the
# literal text "None" instead of an error pointing at the real problem.
# Empty strings are deliberately allowed (e.g. an empty password).
_unset_settings = [
    setting_name
    for setting_name, setting_value in (
        ('DB_USER', DB_USER),
        ('DB_PASSWORD', DB_PASSWORD),
        ('DB_HOST', DB_HOST),
        ('DB_NAME', DB_NAME),
    )
    if setting_value is None
]
if _unset_settings:
    raise RuntimeError(
        "Missing required database settings: " + ", ".join(_unset_settings)
    )

# DB_PORT is optional: when it is unset or empty the port is left out of the
# URL rather than being rendered as "host:None".
_host_and_port = f"{DB_HOST}:{DB_PORT}" if DB_PORT else DB_HOST

# Percent-encode the credentials so characters that are URL delimiters
# ('@', ':', '/', '%', ...) cannot corrupt the connection URL.
# quote(..., safe='') is used instead of quote_plus() so a space is encoded as
# %20 rather than '+', which decodes unambiguously.
# NOTE: the environment must hold the raw credentials, not pre-encoded ones.
connection_string = (
    f"mariadb+pymysql://{quote(DB_USER, safe='')}:{quote(DB_PASSWORD, safe='')}"
    f"@{_host_and_port}/{DB_NAME}"
)

# Create the SQLAlchemy engine used for all writes below.
engine = create_engine(connection_string)
# Input files to load, as (path to tab-separated file, destination table name)
# pairs. They are processed in the order listed here.
file_paths = [
    ('sec_data/2015q1/sub.txt', 'sub'),
    ('sec_data/2015q1/tag.txt', 'tag'),
    ('sec_data/2015q1/num.txt', 'num'),
    ('sec_data/2015q1/pre.txt', 'pre'),
]
# Load every listed file into its database table, printing a short inspection
# report (first rows, schema summary, missing-value counts) for each one.
total_files = len(file_paths)
for file_number, (file_path, table_name) in enumerate(file_paths, start=1):
    # The total is derived from the list so the progress message stays correct
    # when files are added or removed.
    print(f"\nAnalyzing {file_path} (File {file_number}/{total_files})...")

    # Read the tab-separated file into a pandas DataFrame.
    df = pd.read_csv(file_path, sep='\t')

    # Show the first rows for a quick visual check.
    print("First rows of the DataFrame:")
    print(df.head(10))

    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None" line.
    print("\nSummary Information:")
    df.info()

    # Count the missing values in each column.
    missing_values = df.isnull().sum()
    print("\nMissing Values:")
    print(missing_values)

    # For the 'num' table only, replace missing `coreg` values with the
    # placeholder string 'nocoreg' before writing.
    if table_name == 'num':
        df['coreg'] = df['coreg'].fillna('nocoreg')
        print("\nUpdated 'coreg' column (NaN values replaced with 'nocoreg'):")
        # Display the first 10 rows of the column for verification.
        print(df[['coreg']].head(10))

    # Append the rows to the target table; the DataFrame index is not written.
    df.to_sql(table_name, con=engine, if_exists='append', index=False)
    print(f"\nData from {file_path} written to the '{table_name}' table in the database.")

print("\nAll files have been processed and written to the database.")