Wednesday, September 3, 2025

Fund

Umbrella Fund (Global Opportunities Fund Complex)
├── Sub-Fund A: Equity Fund (U.S. & Offshore Master-Feeder)
│   ├── Feeder A1: U.S. Onshore Feeder (Delaware LP)
│   ├── Feeder A2: Offshore Feeder (Cayman Company)
│   └── Master Fund A: Cayman Master Fund
│       └── Portfolio of Securities (Equities, Options, etc.)
│
├── Sub-Fund B: Bond Fund (Standalone)
│   ├── Share Class B1: USD Institutional
│   ├── Share Class B2: EUR Institutional
│   └── Share Class B3: USD Retail
│
└── Sub-Fund C: Multi-Asset Fund (Umbrella with Share Classes only)
    ├── Share Class C1: Accumulating, USD
    ├── Share Class C2: Distributing, EUR
    └── Share Class C3: Distributing, GBP


Friday, June 20, 2025

CICD

# GitLab CI pipeline: Excel metadata -> JSON Schema -> validation -> data
# dictionary -> STTM -> code-value columns.
# NOTE: the `prepare` stage has no job assigned yet; GitLab simply skips it.
# FIX: `stages:` was indented one space relative to the other top-level keys,
# which makes the document invalid YAML — all root keys must start at column 0.
stages:
  - prepare
  - generate_json
  - test_json
  - data_dictionary
  - metadata
  - finalize

default:
  image: python:3.11
  before_script:
    - pip install -r requirements.txt
    - mkdir -p artifacts

variables:
  EXCEL_INPUT: "data/source_metadata.xlsx"
  SAMPLE_JSON: "data/sample_data.json"
  OUTPUT_SCHEMA: "artifacts/generated_schema.json"
  DATA_DICTIONARY: "artifacts/data_dictionary.xlsx"
  STTM_OUTPUT: "artifacts/sttm.xlsx"
  CODE_COLUMNS_OUTPUT: "artifacts/code_columns.xlsx"

# Step 1: Convert Excel to JSON Schema
generate_schema:
  stage: generate_json
  script:
    - python scripts/excel_to_json_schema.py $EXCEL_INPUT $OUTPUT_SCHEMA
  artifacts:
    paths:
      - $OUTPUT_SCHEMA

# Step 2: Validate sample data against generated JSON Schema
test_schema_with_data:
  stage: test_json
  script:
    - python scripts/validate_json_sample.py $OUTPUT_SCHEMA $SAMPLE_JSON
  dependencies:
    - generate_schema

# Step 3: Generate Data Dictionary from JSON Schema
generate_data_dictionary:
  stage: data_dictionary
  script:
    - python scripts/json_to_data_dictionary.py $OUTPUT_SCHEMA $DATA_DICTIONARY
  dependencies:
    - generate_schema
  artifacts:
    paths:
      - $DATA_DICTIONARY

# Step 4: Generate STTM from Data Dictionary
generate_sttm:
  stage: metadata
  script:
    - python scripts/data_dictionary_to_sttm.py $DATA_DICTIONARY $STTM_OUTPUT
  dependencies:
    - generate_data_dictionary
  artifacts:
    paths:
      - $STTM_OUTPUT

# Step 5: Identify code-value columns
identify_code_columns:
  stage: finalize
  script:
    - python scripts/identify_code_columns.py $DATA_DICTIONARY $CODE_COLUMNS_OUTPUT
  dependencies:
    - generate_data_dictionary
  artifacts:
    paths:
      - $CODE_COLUMNS_OUTPUT


Sunday, June 15, 2025

CompareJsonSchemas

 import json

from openpyxl import Workbook

from openpyxl.styles import Font



def load_json_schema(file_path):
    """Read a JSON Schema document from *file_path* and return it as a dict."""
    with open(file_path, 'r') as handle:
        return json.load(handle)



def flatten_schema_properties(schema, path=''):
    """Flatten a JSON Schema's properties into {dotted_path: constraints}.

    Every property — including intermediate objects — gets an entry recording
    the constraint keywords of interest; object-typed properties are then
    descended into recursively.
    """
    keywords = ('type', 'enum', 'format', 'maxLength', 'minimum', 'maximum')
    flattened = {}
    for name, definition in schema.get('properties', {}).items():
        dotted = f"{path}.{name}" if path else name
        flattened[dotted] = {kw: definition.get(kw) for kw in keywords}
        if definition.get('type') == 'object':
            flattened.update(flatten_schema_properties(definition, dotted))
    return flattened



def compare_required_fields(schema1, schema2):
    """Diff the top-level 'required' lists of two schemas.

    Returns (field, change_type, old, new) tuples for every field whose
    required status differs between *schema1* and *schema2*.
    """
    before = set(schema1.get('required', []))
    after = set(schema2.get('required', []))

    diffs = [(name, 'REQUIRED_ADDED', '', 'required') for name in after - before]
    diffs.extend((name, 'REQUIRED_REMOVED', 'required', '') for name in before - after)
    return diffs



def compare_flattened_properties(flat1, flat2):
    """Diff two flattened property maps produced by flatten_schema_properties.

    Returns (path, change_type, value_in_schema_1, value_in_schema_2) tuples;
    property definitions are JSON-encoded so they render cleanly in a report.
    """
    differences = []
    for path in set(flat1) | set(flat2):
        if path not in flat1:
            differences.append((path, 'ADDED', '', json.dumps(flat2[path])))
        elif path not in flat2:
            differences.append((path, 'REMOVED', json.dumps(flat1[path]), ''))
        elif flat1[path] != flat2[path]:
            differences.append(
                (path, 'MODIFIED', json.dumps(flat1[path]), json.dumps(flat2[path]))
            )
    return differences



def write_differences_to_excel(differences, output_path):
    """Write the diff tuples to an .xlsx report with a bold header row."""
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = "Schema Differences"

    sheet.append(["Path", "Change Type", "Schema 1", "Schema 2"])
    # Bold the header row for readability.
    bold = Font(bold=True)
    for header_cell in sheet[1]:
        header_cell.font = bold

    for difference in differences:
        sheet.append(difference)

    workbook.save(output_path)
    print(f"✅ Differences written to {output_path}")



if __name__ == "__main__":
    # Hard-coded input/output paths for standalone runs.
    SCHEMA1_PATH = "schema1.json"
    SCHEMA2_PATH = "schema2.json"
    OUTPUT_EXCEL = "schema_differences.xlsx"

    first = load_json_schema(SCHEMA1_PATH)
    second = load_json_schema(SCHEMA2_PATH)

    # Property-level diffs first, then changes to the top-level required list.
    diffs = compare_flattened_properties(
        flatten_schema_properties(first),
        flatten_schema_properties(second),
    )
    diffs += compare_required_fields(first, second)
    write_differences_to_excel(diffs, OUTPUT_EXCEL)


Saturday, May 31, 2025

Field match report with JSON schema

 import json

import pandas as pd

from rapidfuzz import process, fuzz


def flatten_json_schema(schema, parent_key=''):
    """Return the dotted paths of all leaf properties in a JSON Schema."""
    leaves = []
    for name, definition in schema.get('properties', {}).items():
        dotted = f"{parent_key}.{name}" if parent_key else name
        is_nested = definition.get('type') == 'object' and 'properties' in definition
        if is_nested:
            # Only leaf fields are reported; containers are descended into.
            leaves += flatten_json_schema(definition, dotted)
        else:
            leaves.append(dotted)
    return leaves


def match_fields(source_fields, schema_fields, threshold=85):
    """Match each source field against the schema fields, exactly then fuzzily.

    Parameters:
        source_fields: field names taken from the source metadata.
        schema_fields: flattened dotted paths from the JSON Schema.
        threshold: minimum rapidfuzz ratio (0-100) to accept a fuzzy match.

    Returns a DataFrame with columns
    ['Source Field', 'Matched Field', 'Match Score', 'Match Type'].
    """
    match_results = []
    for field in source_fields:
        if field in schema_fields:
            match_results.append((field, field, 100, 'Exact'))
            continue
        # BUG FIX: process.extractOne returns None for an empty candidate
        # list, which previously crashed on tuple unpacking.
        if not schema_fields:
            match_results.append((field, '', 0, 'Unmatched'))
            continue
        best_match, score, _ = process.extractOne(field, schema_fields, scorer=fuzz.ratio)
        if score >= threshold:
            match_results.append((field, best_match, score, 'Fuzzy'))
        else:
            match_results.append((field, '', score, 'Unmatched'))
    return pd.DataFrame(match_results, columns=['Source Field', 'Matched Field', 'Match Score', 'Match Type'])


# --- Example usage ---

def validate_fields_against_schema(schema_path, excel_path, sheet_name=0, field_column='Field Name', threshold=85):
    """Match field names from an Excel sheet against a JSON Schema's leaves.

    Parameters:
        schema_path: path to the JSON Schema file.
        excel_path: path to the Excel workbook with source metadata.
        sheet_name: sheet index or name to read (default: first sheet).
        field_column: column holding the source field names.
        threshold: fuzzy-match cutoff forwarded to match_fields.

    Returns the match-report DataFrame from match_fields.
    Raises ValueError if *field_column* is missing from the sheet.
    """
    with open(schema_path, 'r') as f:
        schema = json.load(f)

    # BUG FIX: the previous default sheet_name=None made pd.read_excel return
    # a dict of *all* sheets, so df.columns below raised AttributeError.
    # Default to the first sheet instead.
    df = pd.read_excel(excel_path, sheet_name=sheet_name)
    if field_column not in df.columns:
        raise ValueError(f"'{field_column}' column not found in Excel.")

    source_fields = df[field_column].dropna().astype(str).tolist()
    schema_fields = flatten_json_schema(schema)

    return match_fields(source_fields, schema_fields, threshold=threshold)


# ---- To run the script standalone ----

if __name__ == "__main__":
    # Standalone entry point — adjust the paths below as needed.
    SCHEMA_FILE = 'Output_Schema.json'
    EXCEL_FILE = 'Source_Metadata.xlsx'

    report = validate_fields_against_schema(SCHEMA_FILE, EXCEL_FILE)
    print(report)
    report.to_excel('Field_Match_Report.xlsx', index=False)


JSON Schema to Excel with Fuzzy Source

 import json

import pandas as pd

from rapidfuzz import process, fuzz


def flatten_json_schema(schema, parent_key=''):
    """Flatten nested JSON Schema properties into {dotted_path: definition}."""
    flattened = {}
    for name, definition in schema.get('properties', {}).items():
        dotted = f"{parent_key}.{name}" if parent_key else name
        if definition.get('type') == 'object' and 'properties' in definition:
            # Descend into nested objects; only leaf definitions are kept.
            flattened.update(flatten_json_schema(definition, dotted))
        else:
            flattened[dotted] = definition
    return flattened


def find_best_match(field_name, candidates, threshold=80):
    """Fuzzy-match *field_name* against *candidates*.

    Returns (match, score) when the best rapidfuzz ratio clears *threshold*,
    otherwise (None, 0). An empty candidate list never matches.
    """
    if not candidates:
        return None, 0
    best, best_score, _ = process.extractOne(field_name, candidates, scorer=fuzz.ratio)
    if best_score < threshold:
        return None, 0
    return best, best_score


def json_schema_to_excel_with_fuzzy_source(schema_path, source_excel_path, output_excel_path):
    """Merge JSON Schema field metadata with source-file metadata into one Excel sheet.

    For every leaf field in the schema, a source metadata row is located via a
    cascade: exact match on 'Field Name', then fuzzy on 'Field Name', then
    fuzzy on 'Title', then fuzzy on 'Alias' (default threshold from
    find_best_match). Unmatched schema fields still produce a row, with the
    source columns left empty.

    Parameters:
        schema_path: path to the JSON Schema file.
        source_excel_path: path to the Excel workbook with source metadata.
        output_excel_path: where the merged metadata workbook is written.
    """
    # Load JSON Schema
    with open(schema_path, 'r') as f:
        schema = json.load(f)

    # Flatten nested schema to {dotted_path: property definition}
    flat_props = flatten_json_schema(schema)

    # Load source Excel metadata, strip columns and strings
    source_df = pd.read_excel(source_excel_path)
    source_df.columns = source_df.columns.str.strip()
    # NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in
    # favour of DataFrame.map — confirm against the pinned pandas version.
    source_df = source_df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

    # Prepare candidate lists for matching; absent columns yield empty lists,
    # which find_best_match treats as "no match".
    source_fields = source_df['Field Name'].dropna().unique().tolist() if 'Field Name' in source_df.columns else []
    source_titles = source_df['Title'].dropna().unique().tolist() if 'Title' in source_df.columns else []
    source_aliases = source_df['Alias'].dropna().unique().tolist() if 'Alias' in source_df.columns else []

    rows = []
    required_fields = schema.get('required', [])

    for json_field, definition in flat_props.items():
        # First try exact match on Field Name
        matched_row = None
        if json_field in source_fields:
            matched_row = source_df[source_df['Field Name'] == json_field].iloc[0]
        else:
            # Try fuzzy match on Field Name
            best_field, score = find_best_match(json_field, source_fields)
            if best_field:
                matched_row = source_df[source_df['Field Name'] == best_field].iloc[0]
            else:
                # Try exact or fuzzy on Title
                best_title, score = find_best_match(json_field, source_titles)
                if best_title:
                    matched_row = source_df[source_df['Title'] == best_title].iloc[0]
                else:
                    # Try exact or fuzzy on Alias
                    best_alias, score = find_best_match(json_field, source_aliases)
                    if best_alias:
                        matched_row = source_df[source_df['Alias'] == best_alias].iloc[0]

        # Prepare row with source metadata or empty if no match.
        # ('X' in matched_row tests the Series index, i.e. the column names.)
        row = {
            "Source File Name": matched_row['Source File Name'] if matched_row is not None and 'Source File Name' in matched_row else '',
            "Source Field Name": matched_row['Field Name'] if matched_row is not None and 'Field Name' in matched_row else '',
            "Source Data Type": matched_row['Source Data Type'] if matched_row is not None and 'Source Data Type' in matched_row else '',
            "Source Scale": matched_row['Source Scale'] if matched_row is not None and 'Source Scale' in matched_row else '',
            "Source PK": matched_row['Source PK'] if matched_row is not None and 'Source PK' in matched_row else '',
            "Source Nullability": matched_row['Source Nullability'] if matched_row is not None and 'Source Nullability' in matched_row else '',
            "Source Data Sample": matched_row['Source Data Sample'] if matched_row is not None and 'Source Data Sample' in matched_row else '',
            # JSON Schema metadata
            "Field Name": json_field,
            "Title": definition.get("title", ""),
            "Description": definition.get("description", ""),
            "Type": definition.get("type", ""),
            "MaxLength": definition.get("maxLength", ""),
            "Enum": ", ".join(definition.get("enum", [])) if definition.get("enum") else "",
            "Required": json_field in required_fields
        }
        rows.append(row)

    # Export to Excel
    df_out = pd.DataFrame(rows)
    df_out.to_excel(output_excel_path, index=False)
    print(f"Metadata Excel generated with fuzzy matching at: {output_excel_path}")


# Example usage

# json_schema_to_excel_with_fuzzy_source('Output_Schema.json', 'source_metadata.xlsx', 'merged_metadata.xlsx')


Excel to JSON Schema with Primary Key Indicator

 import pandas as pd

import json

from collections import defaultdict


def excel_to_json_schema_with_primary_key(excel_path, output_schema_path="Output_Schema.json"):
    """Build a JSON Schema from an Excel metadata sheet.

    Expected columns: 'Column Name' (dot notation creates nested objects),
    'Type', 'Title', 'Description', 'MaxLength', 'Enum' (comma-separated),
    'Primary Key' ('Yes'/'No') and 'Required' ('Yes'/'No').

    Writes the schema to *output_schema_path* as pretty-printed JSON.
    """
    df = pd.read_excel(excel_path)

    # Fill NaNs so .get()/truthiness checks behave uniformly.
    df = df.fillna("")

    # Normalize header whitespace.
    df.columns = [col.strip() if isinstance(col, str) else col for col in df.columns]

    root = {
        "type": "object",
        "properties": {},
        "required": []
    }

    for _, row in df.iterrows():
        # str() guards against non-string cells (numbers would crash .strip()).
        column_name = str(row.get("Column Name", "")).strip()
        if not column_name:
            continue

        parts = column_name.split(".")
        current = root
        props = current["properties"]

        # Traverse/create nested objects along the dotted path.
        for i, part in enumerate(parts):
            if i == len(parts) - 1:
                # Final leaf property.
                schema_prop = {
                    "type": str(row.get("Type", "")).strip().lower()
                }

                if row.get("Title"):
                    schema_prop["title"] = row["Title"]

                if row.get("Description"):
                    schema_prop["description"] = row["Description"]

                # int(float(...)) accepts "10", "10.0" and 10.0 from Excel,
                # where plain int("10.0") would raise ValueError.
                if row.get("MaxLength") and schema_prop["type"] == "string":
                    schema_prop["maxLength"] = int(float(row["MaxLength"]))

                if row.get("Enum"):
                    schema_prop["enum"] = [e.strip() for e in str(row["Enum"]).split(",")]

                if str(row.get("Primary Key", "")).strip().lower() == "yes":
                    # Non-standard keyword consumed by downstream tooling.
                    schema_prop["primaryKey"] = True

                # Record required fields on the object that owns the leaf.
                if str(row.get("Required", "")).strip().lower() == "yes":
                    if "required" not in current:
                        current["required"] = []
                    current["required"].append(part)

                props[part] = schema_prop
            else:
                # Intermediate nested object: create on first sighting.
                if part not in props:
                    props[part] = {
                        "type": "object",
                        "properties": {}
                    }
                current = props[part]
                props = current["properties"]

    with open(output_schema_path, "w") as f:
        json.dump(root, f, indent=2)

    print(f"✅ JSON Schema written to {output_schema_path}")


if __name__ == "__main__":
    # Standalone run: convert the default metadata workbook into a schema.
    excel_to_json_schema_with_primary_key("schema_metadata.xlsx")


JSON Schema to Excel Metadata with PK info as well

 import json

import pandas as pd


def flatten_schema(schema, group="", parent_key=""):
    """Recursively flatten a JSON Schema into per-leaf metadata rows.

    Only leaf properties produce rows; object properties and arrays of
    objects are descended into, with *group* set to the container's dotted
    path. Returns a list of dicts with keys: Column Name, Type, Title,
    Description, MaxLength, Enum, Group, Primary Key.
    """
    rows = []
    properties = schema.get("properties", {})

    for prop, details in properties.items():
        full_key = f"{parent_key}.{prop}" if parent_key else prop
        entry = {
            "Column Name": full_key,
            "Type": details.get("type", ""),
            "Title": details.get("title", ""),
            "Description": details.get("description", ""),
            "MaxLength": details.get("maxLength", ""),
            # BUG FIX: map(str, ...) so non-string enum members (e.g.
            # numbers) no longer crash str.join with a TypeError.
            "Enum": ", ".join(map(str, details["enum"])) if "enum" in details else "",
            "Group": group,
            "Primary Key": "Yes" if details.get("primaryKey", False) else "No"
        }

        if details.get("type") == "object":
            # Nested object: recurse; no row is emitted for the container.
            rows += flatten_schema(details, group=full_key, parent_key=full_key)
        elif details.get("type") == "array" and "items" in details and details["items"].get("type") == "object":
            # Array of objects: flatten the item schema under this path.
            rows += flatten_schema(details["items"], group=full_key, parent_key=full_key)
        else:
            rows.append(entry)

    return rows


def json_schema_to_excel(schema_path, output_excel="schema_metadata.xlsx"):
    """Flatten the schema at *schema_path* and write its metadata to Excel."""
    with open(schema_path, "r") as handle:
        schema = json.load(handle)

    metadata = pd.DataFrame(flatten_schema(schema))
    metadata.to_excel(output_excel, index=False)
    print(f"✅ Metadata written to Excel: {output_excel}")


if __name__ == "__main__":
    # Standalone run: export the default schema's metadata to Excel.
    json_schema_to_excel("Output_Schema.json")