codingbyexamples: JsonSchemato Excel with Fuzzy Source

import json

import pandas as pd

from rapidfuzz import process, fuzz

def flatten_json_schema(schema, parent_key=''):

"""Flatten nested JSON schema properties using dot notation"""

items = {}

properties = schema.get('properties', {})

for k, v in properties.items():

full_key = f"{parent_key}.{k}" if parent_key else k

if v.get('type') == 'object' and 'properties' in v:

items.update(flatten_json_schema(v, full_key))

else:

items[full_key] = v

return items

def find_best_match(field_name, candidates, threshold=80):

"""Return best fuzzy match from candidates if above threshold"""

if not candidates:

return None, 0

match, score, _ = process.extractOne(field_name, candidates, scorer=fuzz.ratio)

if score >= threshold:

return match, score

return None, 0

def json_schema_to_excel_with_fuzzy_source(schema_path, source_excel_path, output_excel_path):

# Load JSON Schema

with open(schema_path, 'r') as f:

schema = json.load(f)

# Flatten nested schema

flat_props = flatten_json_schema(schema)

# Load source Excel metadata, strip columns and strings

source_df = pd.read_excel(source_excel_path)

source_df.columns = source_df.columns.str.strip()

source_df = source_df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Prepare lists for matching

source_fields = source_df['Field Name'].dropna().unique().tolist() if 'Field Name' in source_df.columns else []

source_titles = source_df['Title'].dropna().unique().tolist() if 'Title' in source_df.columns else []

source_aliases = source_df['Alias'].dropna().unique().tolist() if 'Alias' in source_df.columns else []

rows = []

required_fields = schema.get('required', [])

for json_field, definition in flat_props.items():

# First try exact match on Field Name

matched_row = None

if json_field in source_fields:

matched_row = source_df[source_df['Field Name'] == json_field].iloc[0]

else:

# Try fuzzy match on Field Name

best_field, score = find_best_match(json_field, source_fields)

if best_field:

matched_row = source_df[source_df['Field Name'] == best_field].iloc[0]

else:

# Try exact or fuzzy on Title

best_title, score = find_best_match(json_field, source_titles)

if best_title:

matched_row = source_df[source_df['Title'] == best_title].iloc[0]

else:

# Try exact or fuzzy on Alias

best_alias, score = find_best_match(json_field, source_aliases)

if best_alias:

matched_row = source_df[source_df['Alias'] == best_alias].iloc[0]

# Prepare row with source metadata or empty if no match

row = {

"Source File Name": matched_row['Source File Name'] if matched_row is not None and 'Source File Name' in matched_row else '',

"Source Field Name": matched_row['Field Name'] if matched_row is not None and 'Field Name' in matched_row else '',

"Source Data Type": matched_row['Source Data Type'] if matched_row is not None and 'Source Data Type' in matched_row else '',

"Source Scale": matched_row['Source Scale'] if matched_row is not None and 'Source Scale' in matched_row else '',

"Source PK": matched_row['Source PK'] if matched_row is not None and 'Source PK' in matched_row else '',

"Source Nullability": matched_row['Source Nullability'] if matched_row is not None and 'Source Nullability' in matched_row else '',

"Source Data Sample": matched_row['Source Data Sample'] if matched_row is not None and 'Source Data Sample' in matched_row else '',

# JSON Schema metadata

"Field Name": json_field,

"Title": definition.get("title", ""),

"Description": definition.get("description", ""),

"Type": definition.get("type", ""),

"MaxLength": definition.get("maxLength", ""),

"Enum": ", ".join(definition.get("enum", [])) if definition.get("enum") else "",

"Required": json_field in required_fields

}

rows.append(row)

# Export to Excel

df_out = pd.DataFrame(rows)

df_out.to_excel(output_excel_path, index=False)

print(f"Metadata Excel generated with fuzzy matching at: {output_excel_path}")

# Example usage

# json_schema_to_excel_with_fuzzy_source('Output_Schema.json', 'source_metadata.xlsx', 'merged_metadata.xlsx')

codingbyexamples

Saturday, May 31, 2025

JsonSchemato Excel with Fuzzy Source

No comments:

Post a Comment