|
2 | 2 | """ |
3 | 3 | Metrics Analysis Example - Detailed reproduction quality analysis. |
4 | 4 |
|
| 5 | +Uses the standardized ReproductionMetrics API. |
| 6 | +
|
5 | 7 | Usage: |
6 | 8 | python 06_metrics.py tests/samples/sample_dataclasses.py |
7 | | - python 06_metrics.py code2logic/models.py --compare-formats |
8 | | - python 06_metrics.py tests/samples/ --batch |
| 9 | + python 06_metrics.py tests/samples/sample_class.py --verbose |
9 | 10 | """ |
10 | 11 |
|
11 | 12 | import sys |
12 | 13 | import argparse |
13 | | -import json |
14 | 14 | from pathlib import Path |
15 | 15 |
|
16 | 16 | sys.path.insert(0, str(Path(__file__).parent.parent)) |
|
23 | 23 |
|
24 | 24 | from code2logic import ( |
25 | 25 | ReproductionMetrics, |
26 | | - analyze_reproduction, |
27 | | - compare_formats, |
28 | | - generate_file_gherkin, |
29 | | - generate_file_yaml, |
30 | | - generate_file_json, |
31 | | - reproduce_file, |
32 | 26 | get_client, |
| 27 | + generate_file_gherkin, |
33 | 28 | ) |
34 | 29 | from code2logic.reproduction import extract_code_block |
35 | 30 |
|
36 | 31 |
|
37 | | -def generate_from_template(spec: str) -> str: |
38 | | - """Simple template-based code generation as fallback.""" |
39 | | - # Extract class names from spec |
40 | | - import re |
41 | | - |
42 | | - # Look for actual class names in the spec |
43 | | - classes = [] |
44 | | - |
45 | | - # Pattern 1: Look for class declarations in examples |
46 | | - class_matches = re.findall(r'class (\w+)', spec, re.MULTILINE) |
47 | | - classes.extend(class_matches) |
48 | | - |
49 | | - # Pattern 2: Look for dataclass scenarios |
50 | | - dataclass_matches = re.findall(r'Scenario: (\w+)', spec, re.MULTILINE) |
51 | | - classes.extend(dataclass_matches) |
52 | | - |
53 | | - # Pattern 3: Look for feature names (often class names) |
54 | | - feature_matches = re.findall(r'Feature: (\w+)', spec, re.MULTILINE) |
55 | | - classes.extend(feature_matches) |
56 | | - |
57 | | - # Filter valid class names |
58 | | - valid_classes = [] |
59 | | - for c in classes: |
60 | | - if (c.isidentifier() |
61 | | - and c not in ['Given', 'When', 'Then', 'And', 'Background', 'Scenario', 'Feature', 'Core'] |
62 | | - and not c.lower() in ['test', 'example', 'sample', 'dataclass', 'define'] |
63 | | - and len(c) > 1): # Skip single letters |
64 | | - valid_classes.append(c) |
65 | | - |
66 | | - # Remove duplicates and limit |
67 | | - valid_classes = list(set(valid_classes))[:5] |
68 | | - |
69 | | - # Generate code |
70 | | - code = '''from dataclasses import dataclass, field |
71 | | -from typing import Optional, List, Dict |
72 | | -from datetime import datetime |
73 | | -
|
74 | | -''' |
75 | | - |
76 | | - if valid_classes: |
77 | | - for class_name in valid_classes: |
78 | | - code += f'''@dataclass |
79 | | -class {class_name}: |
80 | | - """Generated from specification.""" |
81 | | - # TODO: Add fields based on original code |
82 | | - |
83 | | -''' |
84 | | - else: |
85 | | - # Fallback: generate a generic class |
86 | | - code += '''@dataclass |
87 | | -class GeneratedClass: |
88 | | - """Generated class from specification.""" |
89 | | - name: str = "" |
90 | | - description: str = "" |
91 | | - |
92 | | -''' |
93 | | - |
94 | | - return code |
95 | | - |
96 | | - |
def analyze_file(source_path: str, verbose: bool = False, no_llm: bool = False):
    """Analyze a single Python file's reproduction quality with detailed metrics.

    Generates a Gherkin spec from the file, reproduces code from that spec
    (via LLM, or a template fallback), then scores the reproduction with
    ReproductionMetrics and prints a summary report.

    Args:
        source_path: Path to the Python source file to analyze.
        verbose: Pass-through to ReproductionMetrics for extra detail.
        no_llm: Skip LLM generation entirely and use the template fallback.

    Returns:
        The ReproductionMetrics analysis result object.
    """
    path = Path(source_path)
    # Read explicitly as UTF-8: the platform default (e.g. cp1252 on Windows)
    # can raise UnicodeDecodeError on perfectly valid source files.
    original = path.read_text(encoding='utf-8')

    print(f"\n{'='*60}")
    print(f"METRICS ANALYSIS: {source_path}")
    print(f"{'='*60}")

    # Generate spec (token count is a rough chars/4 heuristic).
    spec = generate_file_gherkin(source_path)
    print(f"Spec size: {len(spec)} chars ({len(spec)//4} tokens)")

    # Reproduce code from the spec.
    if no_llm:
        generated = _template_generate(spec)
        print("Using template generation (--no-llm)")
    else:
        try:
            client = get_client()
            prompt = f"Generate Python code from this Gherkin spec:\n\n{spec}\n\nOutput only code."
            response = client.generate(prompt, max_tokens=4000)
            generated = extract_code_block(response)
            print(f"Generated: {len(generated)} chars")
        except Exception as e:
            # Deliberate best-effort fallback: any LLM/client failure
            # (network, auth, missing config) degrades to templates.
            print(f"LLM failed: {e}, using template")
            generated = _template_generate(spec)

    # Score the reproduction against the original.
    metrics = ReproductionMetrics(verbose=verbose)
    result = metrics.analyze(original, generated, spec, format_name='gherkin', source_file=source_path)

    # Print results
    print(f"\n📊 Overall: {result.overall_score:.1f}% ({result.quality_grade})")

    print(f"\n📝 Text Metrics:")
    print(f"  Cosine Similarity: {result.text.cosine_similarity:.1f}%")
    print(f"  Jaccard Similarity: {result.text.jaccard_similarity:.1f}%")

    print(f"\n🏗️ Structural:")
    print(f"  Classes: {result.structural.classes_original} → {result.structural.classes_generated}")
    print(f"  Functions: {result.structural.functions_original} → {result.structural.functions_generated}")
    print(f"  Score: {result.structural.structural_score:.1f}%")

    print(f"\n🎯 Semantic:")
    print(f"  Naming: {result.semantic.naming_similarity:.1f}%")
    print(f"  Intent: {result.semantic.intent_score:.1f}%")

    print(f"\n📦 Efficiency:")
    print(f"  Compression: {result.format.compression_ratio:.2f}x")

    if result.recommendations:
        print(f"\n💡 Recommendations:")
        # Cap at three so the console summary stays short.
        for rec in result.recommendations[:3]:
            print(f"  • {rec}")

    return result
|
175 | 90 |
|
176 | | -def compare_all_formats(source_path: str, no_llm: bool = False): |
177 | | - """Compare reproduction across all formats.""" |
178 | | - path = Path(source_path) |
179 | | - original = path.read_text() |
180 | | - |
181 | | - print(f"\nComparing formats for: {source_path}") |
182 | | - print("="*60) |
183 | | - |
184 | | - # Generate specs |
185 | | - formats = {} |
186 | | - for fmt, generator in [ |
187 | | - ('gherkin', generate_file_gherkin), |
188 | | - ('yaml', generate_file_yaml), |
189 | | - ('json', generate_file_json), |
190 | | - ]: |
191 | | - print(f" Generating {fmt} spec...", end=" ", flush=True) |
192 | | - spec = generator(source_path) |
193 | | - formats[fmt] = spec |
194 | | - print(f"✓ ({len(spec)} chars)") |
195 | | - |
196 | | - results = {} |
197 | | - |
198 | | - for fmt, spec in formats.items(): |
199 | | - print(f"\n Testing {fmt}...", end=" ", flush=True) |
200 | | - |
201 | | - if no_llm: |
202 | | - generated = generate_from_template(spec) |
203 | | - results[fmt] = (spec, generated) |
204 | | - print(f"✓ (template, {len(generated)} chars)") |
205 | | - else: |
206 | | - try: |
207 | | - client = get_client() |
208 | | - prompt = f"""Generate Python code from this {fmt} specification: |
209 | | -
|
210 | | -{spec[:4000]} |
211 | | -
|
212 | | -Generate complete, working Python code.""" |
213 | | - |
214 | | - response = client.generate(prompt, max_tokens=4000) |
215 | | - generated = extract_code_block(response) |
216 | | - results[fmt] = (spec, generated) |
217 | | - print(f"✓ ({len(generated)} chars)") |
218 | | - except Exception as e: |
219 | | - print(f"✗ ({e})") |
220 | | - # Fallback to template |
221 | | - generated = generate_from_template(spec) |
222 | | - results[fmt] = (spec, generated) |
223 | | - |
224 | | - # Compare |
225 | | - comparison = compare_formats(original, results) |
226 | | - |
227 | | - # Print comparison |
228 | | - print(f"\n📊 Format Comparison:") |
229 | | - print("-"*60) |
230 | | - print(f"{'Format':<12} {'Overall':>10} {'Grade':>6} {'Text':>10} {'Struct':>10} {'Semantic':>10}") |
231 | | - print("-"*60) |
232 | | - |
233 | | - for fmt, summary in comparison['summary'].items(): |
234 | | - print(f"{fmt:<12} {summary['overall']:>9.1f}% {summary['grade']:>6} " |
235 | | - f"{summary['text']:>9.1f}% {summary['structural']:>9.1f}% {summary['semantic']:>9.1f}%") |
236 | | - |
237 | | - print("-"*60) |
238 | | - print(f"\n🏆 Best Format by Category:") |
239 | | - for category, fmt in comparison['best'].items(): |
240 | | - print(f" {category}: {fmt}") |
241 | | - |
242 | | - return comparison |
243 | | - |
244 | | - |
245 | | -def batch_analyze(project_path: str, no_llm: bool = False): |
246 | | - """Analyze all Python files in a directory.""" |
247 | | - path = Path(project_path) |
248 | | - files = list(path.glob('*.py')) |
249 | | - |
250 | | - print(f"\nBatch Analysis: {project_path}") |
251 | | - print(f"Found {len(files)} Python files") |
252 | | - print("="*60) |
253 | | - |
254 | | - all_results = [] |
255 | | - |
256 | | - for file_path in files[:5]: # Limit to 5 for demo |
257 | | - try: |
258 | | - result = analyze_single(str(file_path), verbose=False, no_llm=no_llm) |
259 | | - all_results.append({ |
260 | | - 'file': file_path.name, |
261 | | - 'score': result.overall_score, |
262 | | - 'grade': result.quality_grade, |
263 | | - }) |
264 | | - except Exception as e: |
265 | | - print(f"Error analyzing {file_path.name}: {e}") |
| 91 | +def _template_generate(spec: str) -> str: |
| 92 | + """Simple template fallback.""" |
| 93 | + import re |
| 94 | + classes = list(set(re.findall(r'class (\w+)', spec)))[:3] |
266 | 95 |
|
267 | | - # Summary |
268 | | - if all_results: |
269 | | - avg_score = sum(r['score'] for r in all_results) / len(all_results) |
270 | | - print(f"\n📈 Batch Summary:") |
271 | | - print(f" Files analyzed: {len(all_results)}") |
272 | | - print(f" Average score: {avg_score:.1f}%") |
273 | | - |
274 | | - print(f"\n By file:") |
275 | | - for r in sorted(all_results, key=lambda x: -x['score']): |
276 | | - print(f" {r['file']}: {r['score']:.1f}% ({r['grade']})") |
| 96 | + code = "from dataclasses import dataclass\nfrom typing import Optional, List\n\n" |
| 97 | + for cls in classes: |
| 98 | + if cls.isidentifier() and cls not in ['Given', 'When', 'Then']: |
| 99 | + code += f"@dataclass\nclass {cls}:\n pass\n\n" |
| 100 | + return code |
277 | 101 |
|
278 | 102 |
|
def main():
    """CLI entry point: parse arguments and run the metrics analysis."""
    parser = argparse.ArgumentParser(description='Reproduction metrics analysis')
    parser.add_argument('source', nargs='?', default='tests/samples/sample_dataclasses.py')
    parser.add_argument('--verbose', '-v', action='store_true')
    parser.add_argument('--no-llm', action='store_true')
    opts = parser.parse_args()

    analyze_file(opts.source, opts.verbose, opts.no_llm)
308 | 111 |
|
309 | 112 |
|
310 | 113 | if __name__ == '__main__': |
|
0 commit comments