Wednesday, September 3, 2025

Fund

Umbrella Fund (Global Opportunities Fund Complex)
├── Sub-Fund A: Equity Fund (U.S. & Offshore Master-Feeder)
│   ├── Feeder A1: U.S. Onshore Feeder (Delaware LP)
│   ├── Feeder A2: Offshore Feeder (Cayman Company)
│   └── Master Fund A: Cayman Master Fund
│       └── Portfolio of Securities (Equities, Options, etc.)
│
├── Sub-Fund B: Bond Fund (Standalone)
│   ├── Share Class B1: USD Institutional
│   ├── Share Class B2: EUR Institutional
│   └── Share Class B3: USD Retail
│
└── Sub-Fund C: Multi-Asset Fund (Umbrella with Share Classes only)
    ├── Share Class C1: Accumulating, USD
    ├── Share Class C2: Distributing, EUR
    └── Share Class C3: Distributing, GBP


Friday, June 20, 2025

CICD

# GitLab CI pipeline: Excel metadata -> JSON Schema -> validation -> data
# dictionary -> STTM -> code-value columns.
# NOTE: the `prepare` stage has no job assigned yet; GitLab simply skips it.
# FIX: `stages:` was indented one space relative to the other top-level keys,
# which makes the document invalid YAML — all root keys must start at column 0.
stages:
  - prepare
  - generate_json
  - test_json
  - data_dictionary
  - metadata
  - finalize

default:
  image: python:3.11
  before_script:
    - pip install -r requirements.txt
    - mkdir -p artifacts

variables:
  EXCEL_INPUT: "data/source_metadata.xlsx"
  SAMPLE_JSON: "data/sample_data.json"
  OUTPUT_SCHEMA: "artifacts/generated_schema.json"
  DATA_DICTIONARY: "artifacts/data_dictionary.xlsx"
  STTM_OUTPUT: "artifacts/sttm.xlsx"
  CODE_COLUMNS_OUTPUT: "artifacts/code_columns.xlsx"

# Step 1: Convert Excel to JSON Schema
generate_schema:
  stage: generate_json
  script:
    - python scripts/excel_to_json_schema.py $EXCEL_INPUT $OUTPUT_SCHEMA
  artifacts:
    paths:
      - $OUTPUT_SCHEMA

# Step 2: Validate sample data against generated JSON Schema
test_schema_with_data:
  stage: test_json
  script:
    - python scripts/validate_json_sample.py $OUTPUT_SCHEMA $SAMPLE_JSON
  dependencies:
    - generate_schema

# Step 3: Generate Data Dictionary from JSON Schema
generate_data_dictionary:
  stage: data_dictionary
  script:
    - python scripts/json_to_data_dictionary.py $OUTPUT_SCHEMA $DATA_DICTIONARY
  dependencies:
    - generate_schema
  artifacts:
    paths:
      - $DATA_DICTIONARY

# Step 4: Generate STTM from Data Dictionary
generate_sttm:
  stage: metadata
  script:
    - python scripts/data_dictionary_to_sttm.py $DATA_DICTIONARY $STTM_OUTPUT
  dependencies:
    - generate_data_dictionary
  artifacts:
    paths:
      - $STTM_OUTPUT

# Step 5: Identify code-value columns
identify_code_columns:
  stage: finalize
  script:
    - python scripts/identify_code_columns.py $DATA_DICTIONARY $CODE_COLUMNS_OUTPUT
  dependencies:
    - generate_data_dictionary
  artifacts:
    paths:
      - $CODE_COLUMNS_OUTPUT


Sunday, June 15, 2025

CompareJsonSchemas

 import json

from openpyxl import Workbook

from openpyxl.styles import Font



def load_json_schema(file_path):
    """Read a JSON Schema document from *file_path* and return it as a dict."""
    with open(file_path, 'r') as handle:
        return json.load(handle)



def flatten_schema_properties(schema, path=''):
    """Flatten a JSON Schema's properties into {dotted_path: constraints}.

    Every property — including intermediate objects — gets an entry recording
    the constraint keywords of interest; object-typed properties are then
    descended into recursively.
    """
    keywords = ('type', 'enum', 'format', 'maxLength', 'minimum', 'maximum')
    flattened = {}
    for name, definition in schema.get('properties', {}).items():
        dotted = f"{path}.{name}" if path else name
        flattened[dotted] = {kw: definition.get(kw) for kw in keywords}
        if definition.get('type') == 'object':
            flattened.update(flatten_schema_properties(definition, dotted))
    return flattened



def compare_required_fields(schema1, schema2):
    """Diff the top-level 'required' lists of two schemas.

    Returns (field, change_type, old, new) tuples for every field whose
    required status differs between *schema1* and *schema2*.
    """
    before = set(schema1.get('required', []))
    after = set(schema2.get('required', []))

    diffs = [(name, 'REQUIRED_ADDED', '', 'required') for name in after - before]
    diffs.extend((name, 'REQUIRED_REMOVED', 'required', '') for name in before - after)
    return diffs



def compare_flattened_properties(flat1, flat2):
    """Diff two flattened property maps produced by flatten_schema_properties.

    Returns (path, change_type, value_in_schema_1, value_in_schema_2) tuples;
    property definitions are JSON-encoded so they render cleanly in a report.
    """
    differences = []
    for path in set(flat1) | set(flat2):
        if path not in flat1:
            differences.append((path, 'ADDED', '', json.dumps(flat2[path])))
        elif path not in flat2:
            differences.append((path, 'REMOVED', json.dumps(flat1[path]), ''))
        elif flat1[path] != flat2[path]:
            differences.append(
                (path, 'MODIFIED', json.dumps(flat1[path]), json.dumps(flat2[path]))
            )
    return differences



def write_differences_to_excel(differences, output_path):
    """Write the diff tuples to an .xlsx report with a bold header row."""
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = "Schema Differences"

    sheet.append(["Path", "Change Type", "Schema 1", "Schema 2"])
    # Bold the header row for readability.
    bold = Font(bold=True)
    for header_cell in sheet[1]:
        header_cell.font = bold

    for difference in differences:
        sheet.append(difference)

    workbook.save(output_path)
    print(f"✅ Differences written to {output_path}")



if __name__ == "__main__":
    # Hard-coded input/output paths for standalone runs.
    SCHEMA1_PATH = "schema1.json"
    SCHEMA2_PATH = "schema2.json"
    OUTPUT_EXCEL = "schema_differences.xlsx"

    first = load_json_schema(SCHEMA1_PATH)
    second = load_json_schema(SCHEMA2_PATH)

    # Property-level diffs first, then changes to the top-level required list.
    diffs = compare_flattened_properties(
        flatten_schema_properties(first),
        flatten_schema_properties(second),
    )
    diffs += compare_required_fields(first, second)
    write_differences_to_excel(diffs, OUTPUT_EXCEL)


Saturday, May 31, 2025

Field match report with JSON schema

 import json

import pandas as pd

from rapidfuzz import process, fuzz


def flatten_json_schema(schema, parent_key=''):
    """Return the dotted paths of all leaf properties in a JSON Schema."""
    leaves = []
    for name, definition in schema.get('properties', {}).items():
        dotted = f"{parent_key}.{name}" if parent_key else name
        is_nested = definition.get('type') == 'object' and 'properties' in definition
        if is_nested:
            # Only leaf fields are reported; containers are descended into.
            leaves += flatten_json_schema(definition, dotted)
        else:
            leaves.append(dotted)
    return leaves


def match_fields(source_fields, schema_fields, threshold=85):
    """Match each source field against the schema fields, exactly then fuzzily.

    Parameters:
        source_fields: field names taken from the source metadata.
        schema_fields: flattened dotted paths from the JSON Schema.
        threshold: minimum rapidfuzz ratio (0-100) to accept a fuzzy match.

    Returns a DataFrame with columns
    ['Source Field', 'Matched Field', 'Match Score', 'Match Type'].
    """
    match_results = []
    for field in source_fields:
        if field in schema_fields:
            match_results.append((field, field, 100, 'Exact'))
            continue
        # BUG FIX: process.extractOne returns None for an empty candidate
        # list, which previously crashed on tuple unpacking.
        if not schema_fields:
            match_results.append((field, '', 0, 'Unmatched'))
            continue
        best_match, score, _ = process.extractOne(field, schema_fields, scorer=fuzz.ratio)
        if score >= threshold:
            match_results.append((field, best_match, score, 'Fuzzy'))
        else:
            match_results.append((field, '', score, 'Unmatched'))
    return pd.DataFrame(match_results, columns=['Source Field', 'Matched Field', 'Match Score', 'Match Type'])


# --- Example usage ---

def validate_fields_against_schema(schema_path, excel_path, sheet_name=0, field_column='Field Name', threshold=85):
    """Match field names from an Excel sheet against a JSON Schema's leaves.

    Parameters:
        schema_path: path to the JSON Schema file.
        excel_path: path to the Excel workbook with source metadata.
        sheet_name: sheet index or name to read (default: first sheet).
        field_column: column holding the source field names.
        threshold: fuzzy-match cutoff forwarded to match_fields.

    Returns the match-report DataFrame from match_fields.
    Raises ValueError if *field_column* is missing from the sheet.
    """
    with open(schema_path, 'r') as f:
        schema = json.load(f)

    # BUG FIX: the previous default sheet_name=None made pd.read_excel return
    # a dict of *all* sheets, so df.columns below raised AttributeError.
    # Default to the first sheet instead.
    df = pd.read_excel(excel_path, sheet_name=sheet_name)
    if field_column not in df.columns:
        raise ValueError(f"'{field_column}' column not found in Excel.")

    source_fields = df[field_column].dropna().astype(str).tolist()
    schema_fields = flatten_json_schema(schema)

    return match_fields(source_fields, schema_fields, threshold=threshold)


# ---- To run the script standalone ----

if __name__ == "__main__":
    # Standalone entry point — adjust the paths below as needed.
    SCHEMA_FILE = 'Output_Schema.json'
    EXCEL_FILE = 'Source_Metadata.xlsx'

    report = validate_fields_against_schema(SCHEMA_FILE, EXCEL_FILE)
    print(report)
    report.to_excel('Field_Match_Report.xlsx', index=False)


JSON Schema to Excel with Fuzzy Source

 import json

import pandas as pd

from rapidfuzz import process, fuzz


def flatten_json_schema(schema, parent_key=''):
    """Flatten nested JSON Schema properties into {dotted_path: definition}."""
    flattened = {}
    for name, definition in schema.get('properties', {}).items():
        dotted = f"{parent_key}.{name}" if parent_key else name
        if definition.get('type') == 'object' and 'properties' in definition:
            # Descend into nested objects; only leaf definitions are kept.
            flattened.update(flatten_json_schema(definition, dotted))
        else:
            flattened[dotted] = definition
    return flattened


def find_best_match(field_name, candidates, threshold=80):
    """Fuzzy-match *field_name* against *candidates*.

    Returns (match, score) when the best rapidfuzz ratio clears *threshold*,
    otherwise (None, 0). An empty candidate list never matches.
    """
    if not candidates:
        return None, 0
    best, best_score, _ = process.extractOne(field_name, candidates, scorer=fuzz.ratio)
    if best_score < threshold:
        return None, 0
    return best, best_score


def json_schema_to_excel_with_fuzzy_source(schema_path, source_excel_path, output_excel_path):
    """Merge JSON Schema field metadata with source-file metadata into one Excel sheet.

    For every leaf field in the schema, a source metadata row is located via a
    cascade: exact match on 'Field Name', then fuzzy on 'Field Name', then
    fuzzy on 'Title', then fuzzy on 'Alias' (default threshold from
    find_best_match). Unmatched schema fields still produce a row, with the
    source columns left empty.

    Parameters:
        schema_path: path to the JSON Schema file.
        source_excel_path: path to the Excel workbook with source metadata.
        output_excel_path: where the merged metadata workbook is written.
    """
    # Load JSON Schema
    with open(schema_path, 'r') as f:
        schema = json.load(f)

    # Flatten nested schema to {dotted_path: property definition}
    flat_props = flatten_json_schema(schema)

    # Load source Excel metadata, strip columns and strings
    source_df = pd.read_excel(source_excel_path)
    source_df.columns = source_df.columns.str.strip()
    # NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in
    # favour of DataFrame.map — confirm against the pinned pandas version.
    source_df = source_df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

    # Prepare candidate lists for matching; absent columns yield empty lists,
    # which find_best_match treats as "no match".
    source_fields = source_df['Field Name'].dropna().unique().tolist() if 'Field Name' in source_df.columns else []
    source_titles = source_df['Title'].dropna().unique().tolist() if 'Title' in source_df.columns else []
    source_aliases = source_df['Alias'].dropna().unique().tolist() if 'Alias' in source_df.columns else []

    rows = []
    required_fields = schema.get('required', [])

    for json_field, definition in flat_props.items():
        # First try exact match on Field Name
        matched_row = None
        if json_field in source_fields:
            matched_row = source_df[source_df['Field Name'] == json_field].iloc[0]
        else:
            # Try fuzzy match on Field Name
            best_field, score = find_best_match(json_field, source_fields)
            if best_field:
                matched_row = source_df[source_df['Field Name'] == best_field].iloc[0]
            else:
                # Try exact or fuzzy on Title
                best_title, score = find_best_match(json_field, source_titles)
                if best_title:
                    matched_row = source_df[source_df['Title'] == best_title].iloc[0]
                else:
                    # Try exact or fuzzy on Alias
                    best_alias, score = find_best_match(json_field, source_aliases)
                    if best_alias:
                        matched_row = source_df[source_df['Alias'] == best_alias].iloc[0]

        # Prepare row with source metadata or empty if no match.
        # ('X' in matched_row tests the Series index, i.e. the column names.)
        row = {
            "Source File Name": matched_row['Source File Name'] if matched_row is not None and 'Source File Name' in matched_row else '',
            "Source Field Name": matched_row['Field Name'] if matched_row is not None and 'Field Name' in matched_row else '',
            "Source Data Type": matched_row['Source Data Type'] if matched_row is not None and 'Source Data Type' in matched_row else '',
            "Source Scale": matched_row['Source Scale'] if matched_row is not None and 'Source Scale' in matched_row else '',
            "Source PK": matched_row['Source PK'] if matched_row is not None and 'Source PK' in matched_row else '',
            "Source Nullability": matched_row['Source Nullability'] if matched_row is not None and 'Source Nullability' in matched_row else '',
            "Source Data Sample": matched_row['Source Data Sample'] if matched_row is not None and 'Source Data Sample' in matched_row else '',
            # JSON Schema metadata
            "Field Name": json_field,
            "Title": definition.get("title", ""),
            "Description": definition.get("description", ""),
            "Type": definition.get("type", ""),
            "MaxLength": definition.get("maxLength", ""),
            "Enum": ", ".join(definition.get("enum", [])) if definition.get("enum") else "",
            "Required": json_field in required_fields
        }
        rows.append(row)

    # Export to Excel
    df_out = pd.DataFrame(rows)
    df_out.to_excel(output_excel_path, index=False)
    print(f"Metadata Excel generated with fuzzy matching at: {output_excel_path}")


# Example usage

# json_schema_to_excel_with_fuzzy_source('Output_Schema.json', 'source_metadata.xlsx', 'merged_metadata.xlsx')


Excel to JSON Schema with Primary Key Indicator

 import pandas as pd

import json

from collections import defaultdict


def excel_to_json_schema_with_primary_key(excel_path, output_schema_path="Output_Schema.json"):
    """Build a JSON Schema from an Excel metadata sheet.

    Expected columns: 'Column Name' (dot notation creates nested objects),
    'Type', 'Title', 'Description', 'MaxLength', 'Enum' (comma-separated),
    'Primary Key' ('Yes'/'No') and 'Required' ('Yes'/'No').

    Writes the schema to *output_schema_path* as pretty-printed JSON.
    """
    df = pd.read_excel(excel_path)

    # Fill NaNs so .get()/truthiness checks behave uniformly.
    df = df.fillna("")

    # Normalize header whitespace.
    df.columns = [col.strip() if isinstance(col, str) else col for col in df.columns]

    root = {
        "type": "object",
        "properties": {},
        "required": []
    }

    for _, row in df.iterrows():
        # str() guards against non-string cells (numbers would crash .strip()).
        column_name = str(row.get("Column Name", "")).strip()
        if not column_name:
            continue

        parts = column_name.split(".")
        current = root
        props = current["properties"]

        # Traverse/create nested objects along the dotted path.
        for i, part in enumerate(parts):
            if i == len(parts) - 1:
                # Final leaf property.
                schema_prop = {
                    "type": str(row.get("Type", "")).strip().lower()
                }

                if row.get("Title"):
                    schema_prop["title"] = row["Title"]

                if row.get("Description"):
                    schema_prop["description"] = row["Description"]

                # int(float(...)) accepts "10", "10.0" and 10.0 from Excel,
                # where plain int("10.0") would raise ValueError.
                if row.get("MaxLength") and schema_prop["type"] == "string":
                    schema_prop["maxLength"] = int(float(row["MaxLength"]))

                if row.get("Enum"):
                    schema_prop["enum"] = [e.strip() for e in str(row["Enum"]).split(",")]

                if str(row.get("Primary Key", "")).strip().lower() == "yes":
                    # Non-standard keyword consumed by downstream tooling.
                    schema_prop["primaryKey"] = True

                # Record required fields on the object that owns the leaf.
                if str(row.get("Required", "")).strip().lower() == "yes":
                    if "required" not in current:
                        current["required"] = []
                    current["required"].append(part)

                props[part] = schema_prop
            else:
                # Intermediate nested object: create on first sighting.
                if part not in props:
                    props[part] = {
                        "type": "object",
                        "properties": {}
                    }
                current = props[part]
                props = current["properties"]

    with open(output_schema_path, "w") as f:
        json.dump(root, f, indent=2)

    print(f"✅ JSON Schema written to {output_schema_path}")


if __name__ == "__main__":
    # Standalone run: convert the default metadata workbook into a schema.
    excel_to_json_schema_with_primary_key("schema_metadata.xlsx")


JSON Schema to Excel Metadata with PK info as well

 import json

import pandas as pd


def flatten_schema(schema, group="", parent_key=""):
    """Recursively flatten a JSON Schema into per-leaf metadata rows.

    Only leaf properties produce rows; object properties and arrays of
    objects are descended into, with *group* set to the container's dotted
    path. Returns a list of dicts with keys: Column Name, Type, Title,
    Description, MaxLength, Enum, Group, Primary Key.
    """
    rows = []
    properties = schema.get("properties", {})

    for prop, details in properties.items():
        full_key = f"{parent_key}.{prop}" if parent_key else prop
        entry = {
            "Column Name": full_key,
            "Type": details.get("type", ""),
            "Title": details.get("title", ""),
            "Description": details.get("description", ""),
            "MaxLength": details.get("maxLength", ""),
            # BUG FIX: map(str, ...) so non-string enum members (e.g.
            # numbers) no longer crash str.join with a TypeError.
            "Enum": ", ".join(map(str, details["enum"])) if "enum" in details else "",
            "Group": group,
            "Primary Key": "Yes" if details.get("primaryKey", False) else "No"
        }

        if details.get("type") == "object":
            # Nested object: recurse; no row is emitted for the container.
            rows += flatten_schema(details, group=full_key, parent_key=full_key)
        elif details.get("type") == "array" and "items" in details and details["items"].get("type") == "object":
            # Array of objects: flatten the item schema under this path.
            rows += flatten_schema(details["items"], group=full_key, parent_key=full_key)
        else:
            rows.append(entry)

    return rows


def json_schema_to_excel(schema_path, output_excel="schema_metadata.xlsx"):
    """Flatten the schema at *schema_path* and write its metadata to Excel."""
    with open(schema_path, "r") as handle:
        schema = json.load(handle)

    metadata = pd.DataFrame(flatten_schema(schema))
    metadata.to_excel(output_excel, index=False)
    print(f"✅ Metadata written to Excel: {output_excel}")


if __name__ == "__main__":
    # Standalone run: export the default schema's metadata to Excel.
    json_schema_to_excel("Output_Schema.json")