diff --git a/main.py b/data_dl.py similarity index 83% rename from main.py rename to data_dl.py index 33cbf33..3e3a598 100644 --- a/main.py +++ b/data_dl.py @@ -48,13 +48,13 @@ SUBMISSIONS_URL = ( ) # File paths to save the zip files -companyfacts_zip = os.path.join(SEC_DATA_DIR, "companyfacts.zip") -#submissions_zip = os.path.join(SEC_DATA_DIR, "submissions.zip") +#companyfacts_zip = os.path.join(SEC_DATA_DIR, "companyfacts.zip") +submissions_zip = os.path.join(SEC_DATA_DIR, "submissions.zip") # Download the files -download_file(COMPANYFACTS_URL, companyfacts_zip) -#download_file(SUBMISSIONS_URL, submissions_zip) +#download_file(COMPANYFACTS_URL, companyfacts_zip) +download_file(SUBMISSIONS_URL, submissions_zip) # Extract the files into respective directories -extract_zip(companyfacts_zip, COMPANYFACTS_DIR) -#extract_zip(submissions_zip, SUBMISSIONS_DIR) +#extract_zip(companyfacts_zip, COMPANYFACTS_DIR) +extract_zip(submissions_zip, SUBMISSIONS_DIR) diff --git a/db_schema.sql b/db_schema.sql index 28d1845..a534358 100644 --- a/db_schema.sql +++ b/db_schema.sql @@ -1,55 +1,75 @@ -CREATE TABLE IF NOT EXISTS entities ( - cik INT PRIMARY KEY, -- CIK is now the primary key, ensuring uniqueness - name VARCHAR(255) NOT NULL -- Name of the company +CREATE TABLE num ( + adsh VARCHAR(255), + tag VARCHAR(255), + version VARCHAR(255), + coreg VARCHAR(255), + ddate BIGINT, + qtrs BIGINT, + uom VARCHAR(50), + value DOUBLE PRECISION, + footnote TEXT ); -CREATE TABLE IF NOT EXISTS facts ( - id VARCHAR(255) PRIMARY KEY, -- Unique identifier for the fact - taxonomy VARCHAR(255), -- Taxonomy of the fact - label VARCHAR(255), -- Label of the fact - description TEXT, -- Description of the fact - unit VARCHAR(255) -- Unit of the fact +CREATE TABLE pre ( + adsh VARCHAR(255), + report BIGINT, + line BIGINT, + stmt VARCHAR(255), + inpth BIGINT, + rfile VARCHAR(255), + tag VARCHAR(255), + version VARCHAR(255), + plabel VARCHAR(255), + negating BIGINT ); -CREATE TABLE IF NOT EXISTS data ( - cik INT, -- CIK of the company - fact_id VARCHAR(255), - end DATE, - start DATE, -- Start date of the fact - val INT, - accn VARCHAR(255), - fy INT, - fp VARCHAR(255), - form VARCHAR(255), - filed DATE, - frame VARCHAR(255), - PRIMARY KEY (cik, fact_id, end), - FOREIGN KEY (cik) REFERENCES entities(cik), - FOREIGN KEY (fact_id) REFERENCES facts(id) +CREATE TABLE sub ( + adsh VARCHAR(255), + cik BIGINT, + name VARCHAR(255), + sic DOUBLE PRECISION, + countryba VARCHAR(100), + stprba VARCHAR(100), + cityba VARCHAR(255), + zipba VARCHAR(50), + bas1 VARCHAR(255), + bas2 VARCHAR(255), + baph VARCHAR(255), + countryma VARCHAR(100), + stprma VARCHAR(100), + cityma VARCHAR(255), + zipma VARCHAR(50), + mas1 VARCHAR(255), + mas2 VARCHAR(255), + countryinc VARCHAR(100), + stprinc VARCHAR(100), + ein BIGINT, + former VARCHAR(255), + changed DOUBLE PRECISION, + afs VARCHAR(255), + wksi BIGINT, + fye DOUBLE PRECISION, + form VARCHAR(50), + period DOUBLE PRECISION, + fy DOUBLE PRECISION, + fp VARCHAR(50), + filed BIGINT, + accepted VARCHAR(255), + prevrpt BIGINT, + detail BIGINT, + instance VARCHAR(255), + nciks BIGINT, + aciks VARCHAR(255) ); - --- @block -CREATE TABLE IF NOT EXISTS data ( - entity_cik INT, - entity_name VARCHAR(255), - fact_id VARCHAR(255), - fact_taxonomy VARCHAR(255), - fact_label VARCHAR(255), - fact_description TEXT, - fact_unit VARCHAR(255) - end DATE, - val FLOAT, - accn VARCHAR(50), - fy INT, - fp VARCHAR(255), - form VARCHAR(255), - filed DATE, - frame VARCHAR(255), - start DATE, - PRIMARY KEY (entity_cik, fact_id, end, form), - -) - --- @block -CREATE TABLE IF NOT EXISTS data (); +CREATE TABLE tag ( + tag VARCHAR(255), + version VARCHAR(255), + custom BIGINT, + abstract BIGINT, + datatype VARCHAR(255), + iord VARCHAR(50), + crdr VARCHAR(50), + tlabel VARCHAR(255), + doc TEXT +); \ No newline at end of file diff --git a/write_to_db.py b/write_to_db.py index b410bf9..456818e 100644 --- a/write_to_db.py +++ b/write_to_db.py @@ -1,129 +1,18 @@ -import os -import mariadb -import json -from dotenv import load_dotenv +import pandas as pd -# Load environment variables from .env file -load_dotenv() +# Read the data into a Pandas DataFrame +file_path = 'sec_data/2024q1/tag.txt' +df = pd.read_csv(file_path, sep='\t') -def connect_to_db(): - try: - # Read the connection parameters from the environment - conn = mariadb.connect( - user=os.getenv("DB_USER"), - password=os.getenv("DB_PASSWORD"), - host=os.getenv("DB_HOST"), - port=int(os.getenv("DB_PORT")), - database=os.getenv("DB_NAME") - ) - return conn - except mariadb.Error as e: - print(f"Error connecting to MariaDB: {e}") - return None +# Inspect the DataFrame +print("First rows of the DataFrame:") +print(df.head(10)) -def insert_entity(cursor, cik, entity_name): - cursor.execute( - "INSERT IGNORE INTO entities (cik, name) VALUES (?, ?)", (cik, entity_name)) +# Get the DataFrame Information +print("\nSummary Information:") +print(df.info()) -def insert_fact(cursor, taxonomy, fact_id, label, description, unit): - cursor.execute( - "INSERT IGNORE INTO facts (id, taxonomy, label, description, unit) VALUES (?, ?, ?, ?, ?)", - (fact_id, taxonomy, label, description, unit) - ) - -def insert_data(cursor, cik, fact_id, start, end, val, accn, fy, fp, form, filed, frame): - cursor.execute( - """INSERT IGNORE INTO data (cik, fact_id, end, start, val, accn, fy, fp, form, filed, frame) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", - (cik, fact_id, end, start, val, accn, fy, fp, form, filed, frame) - ) - -def cik_exists(cursor, cik): - cursor.execute("SELECT 1 FROM entities WHERE cik = ?", (cik,)) - return cursor.fetchone() is not None - -def parse_json_and_insert_data(file_path): - with open(file_path, 'r') as file: - data = json.load(file) - - cik = data.get('cik') - - # Start a new connection for each file - conn = connect_to_db() - if conn is None: - return False - - try: - cursor = conn.cursor() - - # Optional: Check if cik already exists in the database. - # You can comment this block out if you do not want this check. - if cik_exists(cursor, cik): - print(f"CIK {cik} already exists in the database. Skipping file {file_path}.") - return False - - # Insert the entity - entity_name = data.get('entityName') - insert_entity(cursor, cik, entity_name) - - # Iterate over facts - for taxonomy, fact_details in data['facts'].items(): - for fact_id, fact in fact_details.items(): - # Get fact details - label = fact.get('label') - description = fact.get('description') - - for unit, unit_vals in fact.get('units', {}).items(): - # Insert fact - insert_fact(cursor, taxonomy, fact_id, label, description, unit) - - # Insert each data point - for entry in unit_vals: - start = entry.get('start', None) - end = entry['end'] - val = entry['val'] - accn = entry['accn'] - fy = entry['fy'] - fp = entry['fp'] - form = entry['form'] - filed = entry['filed'] - frame = entry.get('frame', None) - - insert_data(cursor, cik, fact_id, start, end, val, accn, fy, fp, form, filed, frame) - - # Commit transaction - conn.commit() - return True - - except Exception as e: - print(f"Error occurred while processing {file_path}: {e}") - conn.rollback() - return False - finally: - cursor.close() - conn.close() - -def process_all_files_in_directory(directory_path): - files = [f for f in os.listdir(directory_path) if f.endswith('.json')] - total_files = len(files) - processed_files = 0 - - for idx, file_name in enumerate(files, start=1): - file_path = os.path.join(directory_path, file_name) - print(f"Processing file {idx} of {total_files}: {file_name}") - - if parse_json_and_insert_data(file_path): - processed_files += 1 - print(f"Successfully processed {file_name}") - else: - print(f"Failed to process {file_name}") - - print(f"Finished processing {processed_files} out of {total_files} files.") - -def main(): - # Process all JSON files in the directory - directory_path = './sec_data/companyfacts/' - process_all_files_in_directory(directory_path) - -if __name__ == "__main__": - main() \ No newline at end of file +# Check if there are any missing values in the DataFrame +missing_values = df.isnull().sum() +print("\nMissing Values:") +print(missing_values)