Example Workflows

Workflow 1: Quick Single-Model Evaluation

Use case: Fast evaluation during development

# 1. Create minimal config
cat > config/quick_test.yaml << 'EOF'
benchmark:
  name: "quick-test"
  version: "1.0.0"
  runs: 1

evaluators:
  gpt35:
    provider: "openai"
    model: "gpt-3.5-turbo"
    temperature: 0.0
    max_tokens: 500

metrics:
  - name: "clarity"
    version: "1.0"
    evaluators: ["gpt35"]

inputs:
  quiz_directory: "data/quizzes"
  source_directory: "data/inputs"

outputs:
  results_directory: "data/results"
EOF
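
# (optional) Sanity-check the YAML before running (assumes PyYAML is installed)
python -c "import yaml; yaml.safe_load(open('config/quick_test.yaml'))"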

# 2. Run benchmark
python main.py --config config/quick_test.yaml

# 3. View results
ls -lt data/results
tail -n 20 data/results/<run-bundle>/summary.txt
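
If you want to script step 3 instead, here is a minimal sketch, assuming each run writes its bundle as a directory under data/results containing summary.txt, as the commands above imply:

from pathlib import Path

# Most recently modified run bundle under data/results (assumed layout from above)
results_dir = Path("data/results")
latest = max((p for p in results_dir.iterdir() if p.is_dir()),
             key=lambda p: p.stat().st_mtime)

# Last 20 lines of that run's summary
print("\n".join((latest / "summary.txt").read_text().splitlines()[-20:]))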

Workflow 2: Comprehensive Multi-Model Comparison

Use case: Research-paper evaluation and model selection

# 1. Use comprehensive config
python main.py \
    --config config/comprehensive_eval.yaml \
    --output-prefix experiment_comparison_v1

# 2. Analyze evaluator agreement
python << 'EOF'
import json

# Load results
results = json.load(open('data/results/experiment_comparison_v1/aggregated.json'))

# Extract evaluator comparisons
for metric_name, metric_data in results['aggregations'].items():
    print(f"\n{metric_name}:")
    for evaluator, stats in metric_data.items():
        print(f"  {evaluator}: {stats['mean']:.2f} ± {stats['std_dev']:.2f}")
EOF
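
# (optional) The same aggregations as a table. This is a sketch that assumes the
# aggregated.json layout used above and that pandas is installed.
python << 'EOF'
import json
import pandas as pd

with open('data/results/experiment_comparison_v1/aggregated.json') as f:
    results = json.load(f)

# Rows are evaluators, columns are metrics
means = pd.DataFrame({
    metric: {evaluator: stats['mean'] for evaluator, stats in data.items()}
    for metric, data in results['aggregations'].items()
})
print(means.round(2))
print(means.std().round(2))  # per-metric spread across evaluators (rough disagreement signal)
EOF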

# 3. Generate comparison report
python scripts/generate_comparison_report.py \
    --input data/results/experiment_comparison_v1/aggregated.json \
    --output reports/model_comparison.pdf

Workflow 3: Iterative Metric Development

Use case: Developing and refining a new custom metric

# 1. Create test config with new metric
cat > config/test_new_metric.yaml << 'EOF'
benchmark:
  name: "metric-development"
  version: "0.1.0"
  runs: 3

evaluators:
  gpt4:
    provider: "openai"
    model: "gpt-4"
    temperature: 0.0
    max_tokens: 500

metrics:
  - name: "my_new_metric"
    version: "0.1"
    evaluators: ["gpt4"]
    parameters:
      custom_param: "value"

inputs:
  quiz_directory: "data/quizzes/test_subset"
  source_directory: "data/inputs"

outputs:
  results_directory: "data/results"
EOF

# 2. Run initial test
python main.py --config config/test_new_metric.yaml --output-prefix dev_v1

# 3. Review results and identify issues
cat data/results/dev_v1/summary.txt

# 4. Refine metric implementation
# Edit src/metrics/my_new_metric.py
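#    (a minimal sketch of what such a metric module might look like appears after this workflow)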

# 5. Re-run with new version
# Update version in config to "0.2"
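#    One way to bump it (GNU sed; assumes the metric version string appears only once):
sed -i 's/version: "0.1"/version: "0.2"/' config/test_new_metric.yaml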
python main.py --config config/test_new_metric.yaml --output-prefix dev_v2

# 6. Compare versions
python << 'EOF'
import json

v1 = json.load(open('data/results/dev_v1/aggregated.json'))
v2 = json.load(open('data/results/dev_v2/aggregated.json'))

print("Version comparison:")
print(f"v0.1 mean: {v1['aggregations']['my_new_metric']['gpt4']['mean']:.2f}")
print(f"v0.2 mean: {v2['aggregations']['my_new_metric']['gpt4']['mean']:.2f}")
print(f"v0.1 std: {v1['aggregations']['my_new_metric']['gpt4']['std_dev']:.2f}")
print(f"v0.2 std: {v2['aggregations']['my_new_metric']['gpt4']['std_dev']:.2f}")
EOF
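
Step 4 above edits src/metrics/my_new_metric.py. The benchmark's real metric interface is not shown in this guide, so the following is only a rough sketch of the shape such a module might take; the class layout, method names, and prompt format are assumptions, not the project's actual API:

# src/metrics/my_new_metric.py -- illustrative sketch only; adapt it to the
# project's real metric base class and registration mechanism.
from dataclasses import dataclass


@dataclass
class MyNewMetricResult:
    score: float      # e.g. a 1-5 rating parsed from the evaluator's reply
    rationale: str    # free-text justification, useful when debugging


class MyNewMetric:
    name = "my_new_metric"
    version = "0.2"   # keep in sync with the config

    def __init__(self, custom_param: str = "value"):
        self.custom_param = custom_param

    def build_prompt(self, quiz: dict, source: str) -> str:
        # Ask the evaluator model for a 1-5 rating plus a short rationale.
        return (
            f"Rate the following quiz from 1 to 5 for {self.custom_param}.\n"
            f"Source material:\n{source}\n\nQuiz:\n{quiz}\n"
            "Answer as: SCORE: <n>\nRATIONALE: <one sentence>"
        )

    def parse_response(self, text: str) -> MyNewMetricResult:
        # Forgiving parser: take the number after 'SCORE:' and the text after 'RATIONALE:'.
        score_line = next(l for l in text.splitlines() if l.startswith("SCORE:"))
        score = float(score_line.split(":", 1)[1].strip())
        rationale = text.split("RATIONALE:", 1)[-1].strip()
        return MyNewMetricResult(score=score, rationale=rationale)

Whatever the exact interface, the pieces the benchmark will care about are a stable name/version pair that matches the config and a parser that turns the evaluator's raw reply into a numeric score.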

Workflow 4: Large-Scale Production Evaluation

Use case: Evaluating a production quiz-generation system

# 1. Prepare environment
export BENCHMARK_ENV=production
source .env.production
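#    .env.production should provide the evaluator API credentials; the exact
#    variable names depend on your setup (OPENAI_API_KEY is the usual one for
#    the OpenAI-backed evaluators used above)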

# 2. Run production benchmark with full metrics
python main.py \
    --config config/production_full_eval.yaml \
    --output-prefix prod_eval_$(date +%Y%m%d) \
    --env .env.production

# 3. Generate comprehensive reports
python scripts/generate_report.py \
    --results data/results/prod_eval_$(date +%Y%m%d)/aggregated.json \
    --format pdf \
    --include-visualizations \
    --output reports/production_evaluation_$(date +%Y%m%d).pdf

# 4. Upload results to storage
aws s3 cp \
    data/results/prod_eval_$(date +%Y%m%d)/ \
    s3://quiz-benchmark-results/$(date +%Y/%m/%d)/ \
    --recursive
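
# (optional) Verify the upload
aws s3 ls s3://quiz-benchmark-results/$(date +%Y/%m/%d)/ --recursive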

# 5. Send notification
python scripts/send_notification.py \
    --channel slack \
    --message "Production benchmark completed" \
    --attach reports/production_evaluation_$(date +%Y%m%d).pdf