feat: now uses batches, but seems to write fewer rows (not sure)
This commit is contained in:
32
new_write.py
32
new_write.py
@@ -26,6 +26,10 @@ json_files = glob.glob('./sec_data/companyfacts/*.json')
|
|||||||
file_count = len(json_files)
|
file_count = len(json_files)
|
||||||
current_file = 1
|
current_file = 1
|
||||||
|
|
||||||
|
# Batch size configuration - process `batch_size` files at a time
|
||||||
|
batch_size = 50 # Adjust this number to your preference or based on system resources
|
||||||
|
rows = []
|
||||||
|
|
||||||
# Data type mapping for the DataFrame to SQL conversion
|
# Data type mapping for the DataFrame to SQL conversion
|
||||||
dtype_map = {
|
dtype_map = {
|
||||||
'entity_cik': Integer,
|
'entity_cik': Integer,
|
||||||
@@ -46,7 +50,11 @@ dtype_map = {
|
|||||||
'frame': String(255)
|
'frame': String(255)
|
||||||
}
|
}
|
||||||
|
|
||||||
for json_file in json_files:
|
# Iterate through the JSON files in batches
|
||||||
|
for i in range(0, file_count, batch_size):
|
||||||
|
batch_files = json_files[i:i+batch_size]
|
||||||
|
|
||||||
|
for json_file in batch_files:
|
||||||
try:
|
try:
|
||||||
# Load the JSON data
|
# Load the JSON data
|
||||||
with open(json_file) as f:
|
with open(json_file) as f:
|
||||||
@@ -61,9 +69,6 @@ for json_file in json_files:
|
|||||||
entity_name = data.get('entityName', None)
|
entity_name = data.get('entityName', None)
|
||||||
facts = data.get('facts', {})
|
facts = data.get('facts', {})
|
||||||
|
|
||||||
# Initialize a list to hold rows
|
|
||||||
rows = []
|
|
||||||
|
|
||||||
# Skip files that don't have facts to process
|
# Skip files that don't have facts to process
|
||||||
if not facts:
|
if not facts:
|
||||||
print(f"File {json_file} has no facts to process. Skipping...")
|
print(f"File {json_file} has no facts to process. Skipping...")
|
||||||
@@ -96,19 +101,20 @@ for json_file in json_files:
|
|||||||
# Append the row to rows list
|
# Append the row to rows list
|
||||||
rows.append(row)
|
rows.append(row)
|
||||||
|
|
||||||
# Create DataFrame only if there are rows
|
|
||||||
if rows:
|
|
||||||
df = pd.DataFrame(rows)
|
|
||||||
|
|
||||||
# Write DataFrame to the 'data' table, appending if table exists
|
|
||||||
df.to_sql('data', con=engine, if_exists='append', index=False, dtype=dtype_map)
|
|
||||||
else:
|
|
||||||
print(f"No data rows were created for file {json_file}. Skipping insert...")
|
|
||||||
|
|
||||||
except json.JSONDecodeError as e:
|
except json.JSONDecodeError as e:
|
||||||
print(f"Failed to process file {json_file}: Invalid JSON format. Error: {e}")
|
print(f"Failed to process file {json_file}: Invalid JSON format. Error: {e}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"An error occurred while processing file {json_file}: {e}")
|
print(f"An error occurred while processing file {json_file}: {e}")
|
||||||
|
|
||||||
|
# After processing batch_files, insert accumulated rows into the database
|
||||||
|
if rows:
|
||||||
|
df = pd.DataFrame(rows)
|
||||||
|
|
||||||
|
# Write DataFrame to the 'data' table, appending if table exists
|
||||||
|
df.to_sql('data', con=engine, if_exists='append', index=False, dtype=dtype_map)
|
||||||
|
|
||||||
|
# Clear the list of rows for the next batch
|
||||||
|
rows.clear()
|
||||||
|
|
||||||
print("Processing complete. All files have been handled.")
|
print("Processing complete. All files have been handled.")
|
||||||
Reference in New Issue
Block a user