feat: now use batches, but it seems to write fewer rows; cause not yet identified
This commit is contained in:
112
new_write.py
112
new_write.py
@@ -26,6 +26,10 @@ json_files = glob.glob('./sec_data/companyfacts/*.json')
|
||||
file_count = len(json_files)
|
||||
current_file = 1
|
||||
|
||||
# Batch size configuration - process `batch_size` files at a time
|
||||
batch_size = 50 # Adjust this number to your preference or based on system resources
|
||||
rows = []
|
||||
|
||||
# Data type mapping for the DataFrame to SQL conversion
|
||||
dtype_map = {
|
||||
'entity_cik': Integer,
|
||||
@@ -46,69 +50,71 @@ dtype_map = {
|
||||
'frame': String(255)
|
||||
}
|
||||
|
||||
for json_file in json_files:
|
||||
try:
|
||||
# Load the JSON data
|
||||
with open(json_file) as f:
|
||||
data = json.load(f)
|
||||
# Iterate through the JSON files in batches
|
||||
for i in range(0, file_count, batch_size):
|
||||
batch_files = json_files[i:i+batch_size]
|
||||
|
||||
for json_file in batch_files:
|
||||
try:
|
||||
# Load the JSON data
|
||||
with open(json_file) as f:
|
||||
data = json.load(f)
|
||||
|
||||
# Informing the user about the current file being processed
|
||||
print(f"Processing file {current_file}/{file_count}: {json_file}")
|
||||
current_file += 1
|
||||
# Informing the user about the current file being processed
|
||||
print(f"Processing file {current_file}/{file_count}: {json_file}")
|
||||
current_file += 1
|
||||
|
||||
# Check if the JSON has the keys we're interested in
|
||||
cik = data.get('cik', None)
|
||||
entity_name = data.get('entityName', None)
|
||||
facts = data.get('facts', {})
|
||||
# Check if the JSON has the keys we're interested in
|
||||
cik = data.get('cik', None)
|
||||
entity_name = data.get('entityName', None)
|
||||
facts = data.get('facts', {})
|
||||
|
||||
# Initialize a list to hold rows
|
||||
rows = []
|
||||
# Skip files that don't have facts to process
|
||||
if not facts:
|
||||
print(f"File {json_file} has no facts to process. Skipping...")
|
||||
continue
|
||||
|
||||
# Skip files that don't have facts to process
|
||||
if not facts:
|
||||
print(f"File {json_file} has no facts to process. Skipping...")
|
||||
continue
|
||||
# Process the facts dynamically
|
||||
for taxonomy, fact_items in facts.items():
|
||||
for fact_id, fact_data in fact_items.items():
|
||||
label = fact_data.get('label', None)
|
||||
description = fact_data.get('description', None)
|
||||
units = fact_data.get('units', {})
|
||||
|
||||
# Process the facts dynamically
|
||||
for taxonomy, fact_items in facts.items():
|
||||
for fact_id, fact_data in fact_items.items():
|
||||
label = fact_data.get('label', None)
|
||||
description = fact_data.get('description', None)
|
||||
units = fact_data.get('units', {})
|
||||
for unit, details_list in units.items():
|
||||
for details in details_list:
|
||||
# Generate row dictionary dynamically, only updating non-None values
|
||||
row = {
|
||||
'entity_cik': cik,
|
||||
'entity_name': entity_name,
|
||||
'fact_id': fact_id,
|
||||
'fact_taxonomy': taxonomy,
|
||||
'fact_label': label,
|
||||
'fact_description': description,
|
||||
'fact_unit': unit
|
||||
}
|
||||
|
||||
for unit, details_list in units.items():
|
||||
for details in details_list:
|
||||
# Generate row dictionary dynamically, only updating non-None values
|
||||
row = {
|
||||
'entity_cik': cik,
|
||||
'entity_name': entity_name,
|
||||
'fact_id': fact_id,
|
||||
'fact_taxonomy': taxonomy,
|
||||
'fact_label': label,
|
||||
'fact_description': description,
|
||||
'fact_unit': unit
|
||||
}
|
||||
# Add the remaining keys in details to row
|
||||
for key, value in details.items():
|
||||
row[key] = value
|
||||
|
||||
# Append the row to rows list
|
||||
rows.append(row)
|
||||
|
||||
# Add the remaining keys in details to row
|
||||
for key, value in details.items():
|
||||
row[key] = value
|
||||
|
||||
# Append the row to rows list
|
||||
rows.append(row)
|
||||
except json.JSONDecodeError as e:
|
||||
print(f"Failed to process file {json_file}: Invalid JSON format. Error: {e}")
|
||||
|
||||
# Create DataFrame only if there are rows
|
||||
if rows:
|
||||
df = pd.DataFrame(rows)
|
||||
except Exception as e:
|
||||
print(f"An error occurred while processing file {json_file}: {e}")
|
||||
|
||||
# Write DataFrame to the 'data' table, appending if table exists
|
||||
df.to_sql('data', con=engine, if_exists='append', index=False, dtype=dtype_map)
|
||||
else:
|
||||
print(f"No data rows were created for file {json_file}. Skipping insert...")
|
||||
# After processing batch_files, insert accumulated rows into the database
|
||||
if rows:
|
||||
df = pd.DataFrame(rows)
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
print(f"Failed to process file {json_file}: Invalid JSON format. Error: {e}")
|
||||
# Write DataFrame to the 'data' table, appending if table exists
|
||||
df.to_sql('data', con=engine, if_exists='append', index=False, dtype=dtype_map)
|
||||
|
||||
except Exception as e:
|
||||
print(f"An error occurred while processing file {json_file}: {e}")
|
||||
# Clear the list of rows for the next batch
|
||||
rows.clear()
|
||||
|
||||
print("Processing complete. All files have been handled.")
|
||||
Reference in New Issue
Block a user