# Pipeline stages, listed in the order they are executed.
# NOTE(review): no job visible in this file is assigned to `prepare`.
stages:
  - prepare
  - generate_json
  - test_json
  - data_dictionary
  - metadata
  - finalize
# Settings applied to every job that does not override them.
default:
  image: python:3.11
  before_script:
    - pip install -r requirements.txt
    # All output paths in `variables` live under artifacts/, so make sure
    # the directory exists before any job script writes to it.
    - mkdir -p artifacts
# Shared file paths, passed to the job scripts as command-line arguments.
variables:
  # Inputs
  EXCEL_INPUT: "data/source_metadata.xlsx"
  SAMPLE_JSON: "data/sample_data.json"
  # Outputs (all under artifacts/)
  OUTPUT_SCHEMA: "artifacts/generated_schema.json"
  DATA_DICTIONARY: "artifacts/data_dictionary.xlsx"
  STTM_OUTPUT: "artifacts/sttm.xlsx"
  CODE_COLUMNS_OUTPUT: "artifacts/code_columns.xlsx"
# Step 1: Convert Excel to JSON Schema
# Runs excel_to_json_schema.py with the Excel input path and the schema
# output path, then publishes the schema file as a job artifact.
generate_schema:
  stage: generate_json
  script:
    # Expansions are quoted so a path containing spaces stays one argument.
    - python scripts/excel_to_json_schema.py "$EXCEL_INPUT" "$OUTPUT_SCHEMA"
  artifacts:
    paths:
      - $OUTPUT_SCHEMA
# Step 2: Validate sample data against generated JSON Schema
# Runs validate_json_sample.py with the schema path and the sample JSON path.
# This job declares no artifacts of its own.
test_schema_with_data:
  stage: test_json
  script:
    # Expansions are quoted so a path containing spaces stays one argument.
    - python scripts/validate_json_sample.py "$OUTPUT_SCHEMA" "$SAMPLE_JSON"
  # Only the artifacts of generate_schema (the schema file) are fetched.
  dependencies:
    - generate_schema
# Step 3: Generate Data Dictionary from JSON Schema
# Runs json_to_data_dictionary.py with the schema path and the data
# dictionary output path, then publishes the data dictionary as an artifact.
generate_data_dictionary:
  stage: data_dictionary
  script:
    # Expansions are quoted so a path containing spaces stays one argument.
    - python scripts/json_to_data_dictionary.py "$OUTPUT_SCHEMA" "$DATA_DICTIONARY"
  # Only the artifacts of generate_schema (the schema file) are fetched.
  dependencies:
    - generate_schema
  artifacts:
    paths:
      - $DATA_DICTIONARY
# Step 4: Generate STTM from Data Dictionary
# Runs data_dictionary_to_sttm.py with the data dictionary path and the STTM
# output path, then publishes the STTM file as an artifact.
generate_sttm:
  stage: metadata
  script:
    # Expansions are quoted so a path containing spaces stays one argument.
    - python scripts/data_dictionary_to_sttm.py "$DATA_DICTIONARY" "$STTM_OUTPUT"
  # Only the artifacts of generate_data_dictionary are fetched.
  dependencies:
    - generate_data_dictionary
  artifacts:
    paths:
      - $STTM_OUTPUT
# Step 5: Identify code-value columns
# Runs identify_code_columns.py with the data dictionary path and the
# code-columns output path, then publishes the result as an artifact.
identify_code_columns:
  stage: finalize
  script:
    # Expansions are quoted so a path containing spaces stays one argument.
    - python scripts/identify_code_columns.py "$DATA_DICTIONARY" "$CODE_COLUMNS_OUTPUT"
  # Only the artifacts of generate_data_dictionary are fetched.
  dependencies:
    - generate_data_dictionary
  artifacts:
    paths:
      - $CODE_COLUMNS_OUTPUT
# NOTE: trailing web-page text ("No comments:" / "Post a Comment") is not part of the pipeline configuration.