"""Load tab-separated data files into MariaDB tables.

For every ``(file_path, table_name)`` pair in ``file_paths`` the script reads
the file into a pandas DataFrame, prints a short inspection report (first ten
rows, ``DataFrame.info()`` summary, per-column null counts) and then writes the
frame to the named table, replacing the table if it already exists.

Connection parameters are read from the environment (optionally populated from
a ``.env`` file): ``DB_USER``, ``DB_PASSWORD``, ``DB_HOST``, ``DB_PORT`` and
``DB_NAME``.
"""
import os
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load environment variables from .env file
load_dotenv()

# Get DB connection parameters from environment
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_HOST = os.getenv('DB_HOST')
DB_PORT = os.getenv('DB_PORT')
DB_NAME = os.getenv('DB_NAME')

# Fail early with a clear message: an unset variable would otherwise be
# formatted into the URL as the literal text "None" and only surface later as
# an obscure parsing or connection error.
_missing_vars = [
    name
    for name, value in (
        ('DB_USER', DB_USER),
        ('DB_PASSWORD', DB_PASSWORD),
        ('DB_HOST', DB_HOST),
        ('DB_PORT', DB_PORT),
        ('DB_NAME', DB_NAME),
    )
    if value is None
]
if _missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_vars)
    )

# Create a connection string. User and password are URL-escaped so that
# characters such as '@', ':' or '/' in the credentials cannot corrupt the URL.
connection_string = (
    f"mariadb+pymysql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)

# Create the SQLAlchemy engine
engine = create_engine(connection_string)

# Define a list of file paths and corresponding table names
file_paths = [
    ('sec_data/2024q1/num.txt', 'num'),
    ('sec_data/2024q1/pre.txt', 'pre'),
    ('sec_data/2024q1/sub.txt', 'sub'),
    ('sec_data/2024q1/tag.txt', 'tag'),
]

total_files = len(file_paths)

try:
    # Loop through each file and write the data to the database
    for file_number, (file_path, table_name) in enumerate(file_paths, start=1):
        # The total is derived from the list so the message stays correct
        # when entries are added or removed.
        print(f"\nAnalyzing {file_path} (File {file_number}/{total_files})...")

        # Read the tab-separated data into a Pandas DataFrame
        df = pd.read_csv(file_path, sep='\t')

        # Inspect the DataFrame
        print("First rows of the DataFrame:")
        print(df.head(10))

        # Get the DataFrame information. info() prints its report itself and
        # returns None, so it must not be wrapped in print().
        print("\nSummary Information:")
        df.info()

        # Check if there are any missing values in the DataFrame
        missing_values = df.isnull().sum()
        print("\nMissing Values:")
        print(missing_values)

        # Write the DataFrame to the corresponding table in the MariaDB
        # database; an existing table of the same name is replaced.
        df.to_sql(table_name, con=engine, if_exists='replace', index=False)
        print(f"\nData from {file_path} written to the '{table_name}' table in the database.")

    print("\nAll files have been processed and written to the database.")
finally:
    # Release pooled database connections whether or not the load succeeded.
    engine.dispose()