import json
import pandas as pd
from rapidfuzz import process, fuzz
def flatten_json_schema(schema, parent_key=''):
fields = []
for key, value in schema.get('properties', {}).items():
full_key = f"{parent_key}.{key}" if parent_key else key
if value.get('type') == 'object' and 'properties' in value:
fields.extend(flatten_json_schema(value, full_key))
else:
fields.append(full_key)
return fields
def match_fields(source_fields, schema_fields, threshold=85):
match_results = []
for field in source_fields:
if field in schema_fields:
match_results.append((field, field, 100, 'Exact'))
else:
best_match, score, _ = process.extractOne(field, schema_fields, scorer=fuzz.ratio)
if score >= threshold:
match_results.append((field, best_match, score, 'Fuzzy'))
else:
match_results.append((field, '', score, 'Unmatched'))
return pd.DataFrame(match_results, columns=['Source Field', 'Matched Field', 'Match Score', 'Match Type'])
# --- Example usage ---
def validate_fields_against_schema(schema_path, excel_path, sheet_name=None, field_column='Field Name', threshold=85):
with open(schema_path, 'r') as f:
schema = json.load(f)
df = pd.read_excel(excel_path, sheet_name=sheet_name)
if field_column not in df.columns:
raise ValueError(f"'{field_column}' column not found in Excel.")
source_fields = df[field_column].dropna().astype(str).tolist()
schema_fields = flatten_json_schema(schema)
result_df = match_fields(source_fields, schema_fields, threshold=threshold)
return result_df
# ---- To run the script standalone ----
if __name__ == "__main__":
# Update paths here
schema_file = 'Output_Schema.json'
excel_file = 'Source_Metadata.xlsx'
result = validate_fields_against_schema(schema_file, excel_file)
print(result)
result.to_excel('Field_Match_Report.xlsx', index=False)
No comments:
Post a Comment