codingbyexamples: Field match report with JSON schema

import json

import pandas as pd

from rapidfuzz import process, fuzz

def flatten_json_schema(schema, parent_key=''):

fields = []

for key, value in schema.get('properties', {}).items():

full_key = f"{parent_key}.{key}" if parent_key else key

if value.get('type') == 'object' and 'properties' in value:

fields.extend(flatten_json_schema(value, full_key))

else:

fields.append(full_key)

return fields

def match_fields(source_fields, schema_fields, threshold=85):

match_results = []

for field in source_fields:

if field in schema_fields:

match_results.append((field, field, 100, 'Exact'))

else:

best_match, score, _ = process.extractOne(field, schema_fields, scorer=fuzz.ratio)

if score >= threshold:

match_results.append((field, best_match, score, 'Fuzzy'))

else:

match_results.append((field, '', score, 'Unmatched'))

return pd.DataFrame(match_results, columns=['Source Field', 'Matched Field', 'Match Score', 'Match Type'])

# --- Example usage ---

def validate_fields_against_schema(schema_path, excel_path, sheet_name=None, field_column='Field Name', threshold=85):

with open(schema_path, 'r') as f:

schema = json.load(f)

df = pd.read_excel(excel_path, sheet_name=sheet_name)

if field_column not in df.columns:

raise ValueError(f"'{field_column}' column not found in Excel.")

source_fields = df[field_column].dropna().astype(str).tolist()

schema_fields = flatten_json_schema(schema)

result_df = match_fields(source_fields, schema_fields, threshold=threshold)

return result_df

# ---- To run the script standalone ----

if __name__ == "__main__":

# Update paths here

schema_file = 'Output_Schema.json'

excel_file = 'Source_Metadata.xlsx'

result = validate_fields_against_schema(schema_file, excel_file)

print(result)

result.to_excel('Field_Match_Report.xlsx', index=False)

codingbyexamples