import json
import pandas as pd
from rapidfuzz import process, fuzz
def flatten_json_schema(schema, parent_key=''):
"""Flatten nested JSON schema properties using dot notation"""
items = {}
properties = schema.get('properties', {})
for k, v in properties.items():
full_key = f"{parent_key}.{k}" if parent_key else k
if v.get('type') == 'object' and 'properties' in v:
items.update(flatten_json_schema(v, full_key))
else:
items[full_key] = v
return items
def find_best_match(field_name, candidates, threshold=80):
"""Return best fuzzy match from candidates if above threshold"""
if not candidates:
return None, 0
match, score, _ = process.extractOne(field_name, candidates, scorer=fuzz.ratio)
if score >= threshold:
return match, score
return None, 0
def json_schema_to_excel_with_fuzzy_source(schema_path, source_excel_path, output_excel_path):
# Load JSON Schema
with open(schema_path, 'r') as f:
schema = json.load(f)
# Flatten nested schema
flat_props = flatten_json_schema(schema)
# Load source Excel metadata, strip columns and strings
source_df = pd.read_excel(source_excel_path)
source_df.columns = source_df.columns.str.strip()
source_df = source_df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
# Prepare lists for matching
source_fields = source_df['Field Name'].dropna().unique().tolist() if 'Field Name' in source_df.columns else []
source_titles = source_df['Title'].dropna().unique().tolist() if 'Title' in source_df.columns else []
source_aliases = source_df['Alias'].dropna().unique().tolist() if 'Alias' in source_df.columns else []
rows = []
required_fields = schema.get('required', [])
for json_field, definition in flat_props.items():
# First try exact match on Field Name
matched_row = None
if json_field in source_fields:
matched_row = source_df[source_df['Field Name'] == json_field].iloc[0]
else:
# Try fuzzy match on Field Name
best_field, score = find_best_match(json_field, source_fields)
if best_field:
matched_row = source_df[source_df['Field Name'] == best_field].iloc[0]
else:
# Try exact or fuzzy on Title
best_title, score = find_best_match(json_field, source_titles)
if best_title:
matched_row = source_df[source_df['Title'] == best_title].iloc[0]
else:
# Try exact or fuzzy on Alias
best_alias, score = find_best_match(json_field, source_aliases)
if best_alias:
matched_row = source_df[source_df['Alias'] == best_alias].iloc[0]
# Prepare row with source metadata or empty if no match
row = {
"Source File Name": matched_row['Source File Name'] if matched_row is not None and 'Source File Name' in matched_row else '',
"Source Field Name": matched_row['Field Name'] if matched_row is not None and 'Field Name' in matched_row else '',
"Source Data Type": matched_row['Source Data Type'] if matched_row is not None and 'Source Data Type' in matched_row else '',
"Source Scale": matched_row['Source Scale'] if matched_row is not None and 'Source Scale' in matched_row else '',
"Source PK": matched_row['Source PK'] if matched_row is not None and 'Source PK' in matched_row else '',
"Source Nullability": matched_row['Source Nullability'] if matched_row is not None and 'Source Nullability' in matched_row else '',
"Source Data Sample": matched_row['Source Data Sample'] if matched_row is not None and 'Source Data Sample' in matched_row else '',
# JSON Schema metadata
"Field Name": json_field,
"Title": definition.get("title", ""),
"Description": definition.get("description", ""),
"Type": definition.get("type", ""),
"MaxLength": definition.get("maxLength", ""),
"Enum": ", ".join(definition.get("enum", [])) if definition.get("enum") else "",
"Required": json_field in required_fields
}
rows.append(row)
# Export to Excel
df_out = pd.DataFrame(rows)
df_out.to_excel(output_excel_path, index=False)
print(f"Metadata Excel generated with fuzzy matching at: {output_excel_path}")
# Example usage
# json_schema_to_excel_with_fuzzy_source('Output_Schema.json', 'source_metadata.xlsx', 'merged_metadata.xlsx')
No comments:
Post a Comment