feat: process files in batches; this seems to write fewer rows than before (cause not yet confirmed)

2024-08-31 00:50:17 -04:00
parent 9c5afa7670
commit ab8f1fcf8f
1 changed files with 59 additions and 53 deletions
@@ -26,6 +26,10 @@ json_files = glob.glob('./sec_data/companyfacts/*.json')
 file_count = len(json_files)
 current_file = 1

+# Batch size configuration - process `batch_size` files at a time
+batch_size = 50  # Adjust this number to your preference or based on system resources
+rows = []
+
 # Data type mapping for the DataFrame to SQL conversion
 dtype_map = {
    'entity_cik': Integer,
@@ -46,7 +50,11 @@ dtype_map = {
    'frame': String(255)
 }

-for json_file in json_files:
+# Iterate through the JSON files in batches
+for i in range(0, file_count, batch_size):
+    batch_files = json_files[i:i+batch_size]
+    
+    for json_file in batch_files:
        try:
            # Load the JSON data
            with open(json_file) as f:
@@ -61,9 +69,6 @@ for json_file in json_files:
            entity_name = data.get('entityName', None)
            facts = data.get('facts', {})

-        # Initialize a list to hold rows
-        rows = []
-
            # Skip files that don't have facts to process
            if not facts:
                print(f"File {json_file} has no facts to process. Skipping...")
@@ -96,19 +101,20 @@ for json_file in json_files:
                            # Append the row to rows list
                            rows.append(row)

-        # Create DataFrame only if there are rows
-        if rows:
-            df = pd.DataFrame(rows)
-
-            # Write DataFrame to the 'data' table, appending if table exists
-            df.to_sql('data', con=engine, if_exists='append', index=False, dtype=dtype_map)
-        else:
-            print(f"No data rows were created for file {json_file}. Skipping insert...")
-
        except json.JSONDecodeError as e:
            print(f"Failed to process file {json_file}: Invalid JSON format. Error: {e}")

        except Exception as e:
            print(f"An error occurred while processing file {json_file}: {e}")

+    # After processing batch_files, insert accumulated rows into the database
+    if rows:
+        df = pd.DataFrame(rows)
+
+        # Write DataFrame to the 'data' table, appending if table exists
+        df.to_sql('data', con=engine, if_exists='append', index=False, dtype=dtype_map)
+
+        # Clear the list of rows for the next batch
+        rows.clear()
+
 print("Processing complete. All files have been handled.")