Saturday, May 31, 2025

JsonSchemato Excel with Fuzzy Source

 import json

import pandas as pd

from rapidfuzz import process, fuzz


def flatten_json_schema(schema, parent_key=''):

    """Flatten nested JSON schema properties using dot notation"""

    items = {}

    properties = schema.get('properties', {})

    for k, v in properties.items():

        full_key = f"{parent_key}.{k}" if parent_key else k

        if v.get('type') == 'object' and 'properties' in v:

            items.update(flatten_json_schema(v, full_key))

        else:

            items[full_key] = v

    return items


def find_best_match(field_name, candidates, threshold=80):

    """Return best fuzzy match from candidates if above threshold"""

    if not candidates:

        return None, 0

    match, score, _ = process.extractOne(field_name, candidates, scorer=fuzz.ratio)

    if score >= threshold:

        return match, score

    return None, 0


def json_schema_to_excel_with_fuzzy_source(schema_path, source_excel_path, output_excel_path):

    # Load JSON Schema

    with open(schema_path, 'r') as f:

        schema = json.load(f)

    

    # Flatten nested schema

    flat_props = flatten_json_schema(schema)

    

    # Load source Excel metadata, strip columns and strings

    source_df = pd.read_excel(source_excel_path)

    source_df.columns = source_df.columns.str.strip()

    source_df = source_df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

    

    # Prepare lists for matching

    source_fields = source_df['Field Name'].dropna().unique().tolist() if 'Field Name' in source_df.columns else []

    source_titles = source_df['Title'].dropna().unique().tolist() if 'Title' in source_df.columns else []

    source_aliases = source_df['Alias'].dropna().unique().tolist() if 'Alias' in source_df.columns else []

    

    rows = []

    required_fields = schema.get('required', [])

    

    for json_field, definition in flat_props.items():

        # First try exact match on Field Name

        matched_row = None

        if json_field in source_fields:

            matched_row = source_df[source_df['Field Name'] == json_field].iloc[0]

        else:

            # Try fuzzy match on Field Name

            best_field, score = find_best_match(json_field, source_fields)

            if best_field:

                matched_row = source_df[source_df['Field Name'] == best_field].iloc[0]

            else:

                # Try exact or fuzzy on Title

                best_title, score = find_best_match(json_field, source_titles)

                if best_title:

                    matched_row = source_df[source_df['Title'] == best_title].iloc[0]

                else:

                    # Try exact or fuzzy on Alias

                    best_alias, score = find_best_match(json_field, source_aliases)

                    if best_alias:

                        matched_row = source_df[source_df['Alias'] == best_alias].iloc[0]

        

        # Prepare row with source metadata or empty if no match

        row = {

            "Source File Name": matched_row['Source File Name'] if matched_row is not None and 'Source File Name' in matched_row else '',

            "Source Field Name": matched_row['Field Name'] if matched_row is not None and 'Field Name' in matched_row else '',

            "Source Data Type": matched_row['Source Data Type'] if matched_row is not None and 'Source Data Type' in matched_row else '',

            "Source Scale": matched_row['Source Scale'] if matched_row is not None and 'Source Scale' in matched_row else '',

            "Source PK": matched_row['Source PK'] if matched_row is not None and 'Source PK' in matched_row else '',

            "Source Nullability": matched_row['Source Nullability'] if matched_row is not None and 'Source Nullability' in matched_row else '',

            "Source Data Sample": matched_row['Source Data Sample'] if matched_row is not None and 'Source Data Sample' in matched_row else '',

            # JSON Schema metadata

            "Field Name": json_field,

            "Title": definition.get("title", ""),

            "Description": definition.get("description", ""),

            "Type": definition.get("type", ""),

            "MaxLength": definition.get("maxLength", ""),

            "Enum": ", ".join(definition.get("enum", [])) if definition.get("enum") else "",

            "Required": json_field in required_fields

        }

        rows.append(row)

    

    # Export to Excel

    df_out = pd.DataFrame(rows)

    df_out.to_excel(output_excel_path, index=False)

    print(f"Metadata Excel generated with fuzzy matching at: {output_excel_path}")


# Example usage

# json_schema_to_excel_with_fuzzy_source('Output_Schema.json', 'source_metadata.xlsx', 'merged_metadata.xlsx')


No comments:

Post a Comment