Saturday, May 31, 2025

Field match report with JSON schema

 import json

import pandas as pd

from rapidfuzz import process, fuzz


def flatten_json_schema(schema, parent_key=''):

    fields = []

    for key, value in schema.get('properties', {}).items():

        full_key = f"{parent_key}.{key}" if parent_key else key

        if value.get('type') == 'object' and 'properties' in value:

            fields.extend(flatten_json_schema(value, full_key))

        else:

            fields.append(full_key)

    return fields


def match_fields(source_fields, schema_fields, threshold=85):

    match_results = []

    for field in source_fields:

        if field in schema_fields:

            match_results.append((field, field, 100, 'Exact'))

        else:

            best_match, score, _ = process.extractOne(field, schema_fields, scorer=fuzz.ratio)

            if score >= threshold:

                match_results.append((field, best_match, score, 'Fuzzy'))

            else:

                match_results.append((field, '', score, 'Unmatched'))

    return pd.DataFrame(match_results, columns=['Source Field', 'Matched Field', 'Match Score', 'Match Type'])


# --- Example usage ---

def validate_fields_against_schema(schema_path, excel_path, sheet_name=None, field_column='Field Name', threshold=85):

    with open(schema_path, 'r') as f:

        schema = json.load(f)


    df = pd.read_excel(excel_path, sheet_name=sheet_name)

    if field_column not in df.columns:

        raise ValueError(f"'{field_column}' column not found in Excel.")


    source_fields = df[field_column].dropna().astype(str).tolist()

    schema_fields = flatten_json_schema(schema)


    result_df = match_fields(source_fields, schema_fields, threshold=threshold)

    return result_df


# ---- To run the script standalone ----

if __name__ == "__main__":

    # Update paths here

    schema_file = 'Output_Schema.json'

    excel_file = 'Source_Metadata.xlsx'


    result = validate_fields_against_schema(schema_file, excel_file)

    print(result)

    result.to_excel('Field_Match_Report.xlsx', index=False)


No comments:

Post a Comment