type_coverage_py/main.py at main · lolpack/type_coverage_py · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
import argparse
import concurrent.futures
import json
import os
import shutil
import sys
import tempfile
from typing import Any, Optional

from analyzer.coverage_calculator import calculate_overall_coverage
from analyzer.package_analyzer import extract_files, find_stub_package
from analyzer.report_generator import (
    archive_old_reports,
    generate_report,
    generate_report_html,
)
from analyzer.typeshed_checker import (
    check_typeshed,
    find_stub_files,
    merge_files_with_stubs,
)
from analyzer.historical_view_generator import generate_dates_manifest
from coverage_sources.get_pyright_stats import main as get_pyright_stats
from coverage_sources.typeshed_coverage import download_typeshed_csv

JSON_REPORT_FILE = "package_report.json"
HTML_REPORT_FILE = "index.html"
TOP_PYPI_PACKAGES = "top-pypi-packages-30-days.min.json"
STUB_PACKAGES = "stub_packages.json"

HISTORICAL_DATA_DIR = "historical_data"


def load_and_sort_top_packages(json_file: str) -> list[dict[str, Any]]:
    """Load the JSON file and sort it by download_count."""
    with open(json_file, "r") as f:
        data = json.load(f)

    sorted_rows = sorted(data["rows"], key=lambda x: x["download_count"], reverse=True)
    return sorted_rows


def separate_test_files(files: list[str]) -> list[str]:
    """Separate files into test files and non-test files."""
    non_test_files: list[str] = []
    for file in files:
        parts = [x.replace("_", "") for x in file.split("/")]
        if "test" in parts or "tests" in parts:
            continue
        non_test_files.append(file)
    return non_test_files


def analyze_package(
    package_name: str,
    rank: Optional[int] = None,
    download_count: Optional[int] = None,
    typeshed_data: Optional[dict[str, Any]] = None,
    has_stub_package: bool = False,
    parallel: bool = False,
) -> dict[str, Any]:
    """Analyze a single package and generate a report."""
    package_report: dict[str, Any] = {
        "DownloadCount": download_count,
        "DownloadRanking": rank,
        "CoverageData": {},
    }

    # Create a temporary directory
    temp_dir = tempfile.mkdtemp()
    stub_package_dir = tempfile.mkdtemp() if has_stub_package else None

    try:
        print(f"Analyzing package: {package_name} rank {rank}")

        # Download and extract package files
        files, has_py_typed_file = extract_files(package_name, temp_dir)

        # Separate test and non-test files
        non_test_files = separate_test_files(files)

        stub_has_py_typed_file = False
        if stub_package_dir:
            stub_package_files, stub_has_py_typed_file = extract_files(
                package_name + "-stubs", stub_package_dir
            )
            files = merge_files_with_stubs(files, stub_package_files)
            non_test_files = merge_files_with_stubs(non_test_files, stub_package_files)

        package_report["HasPyTypedFile"] = has_py_typed_file or stub_has_py_typed_file

        non_test_coverage = calculate_overall_coverage(non_test_files)
        parameter_coverage = non_test_coverage["parameter_coverage"]
        return_type_coverage = non_test_coverage["return_type_coverage"]
        skipped_files_non_tests = non_test_coverage["skipped_files"]

        package_report["CoverageData"]["parameter_coverage"] = parameter_coverage
        package_report["CoverageData"]["return_type_coverage"] = return_type_coverage
        package_report["SurfaceArea"] = non_test_coverage["surface_area"]

        total_test_coverage = calculate_overall_coverage(files)
        skipped_tests = total_test_coverage["skipped_files"]

        package_report["CoverageData"]["param_coverage_with_tests"] = (
            total_test_coverage["parameter_coverage"]
        )
        package_report["CoverageData"]["return_coverage_with_tests"] = (
            total_test_coverage["return_type_coverage"]
        )

        # Merge non-test files with typeshed stubs
        typeshed_exists = check_typeshed(package_name)
        package_report["HasTypeShed"] = typeshed_exists
        package_report["HasStubsPackage"] = has_stub_package
        package_report["non_typeshed_stubs"] = "N/A"
        if typeshed_exists:
            if not parallel:
                print(f"Typeshed exists for {package_name}. Including it in analysis.")
            stub_files = find_stub_files(package_name)
            merged_files = merge_files_with_stubs(non_test_files, stub_files)

            # Calculate coverage with stubs
            total_test_coverage_stubs = calculate_overall_coverage(merged_files)
            parameter_coverage_with_stubs = total_test_coverage_stubs[
                "parameter_coverage"
            ]
            return_type_coverage_with_stubs = total_test_coverage_stubs[
                "return_type_coverage"
            ]
            skipped_files_with_stubs = total_test_coverage["skipped_files"]
        else:
            # Check for PyPI stub package if no Typeshed stubs exist
            stub_package_url = find_stub_package(package_name)
            if stub_package_url:
                print(f"Found non-typeshed stub package: {stub_package_url}")
                package_report["non_typeshed_stubs"] = stub_package_url

                # Download and merge PyPI stub files
                stub_temp_dir = tempfile.mkdtemp()
                try:
                    stub_files, _ = extract_files(
                        f"{package_name}-stubs", stub_temp_dir
                    )
                    merged_files = merge_files_with_stubs(non_test_files, stub_files)

                    # Calculate coverage with stubs
                    total_test_coverage_stubs = calculate_overall_coverage(merged_files)
                    parameter_coverage_with_stubs = total_test_coverage_stubs[
                        "parameter_coverage"
                    ]
                    return_type_coverage_with_stubs = total_test_coverage_stubs[
                        "return_type_coverage"
                    ]
                    skipped_files_with_stubs = total_test_coverage["skipped_files"]
                finally:
                    print("temp file removed", stub_temp_dir)
                    shutil.rmtree(stub_temp_dir)
            else:
                print(f"No stubs found for {package_name} in Typeshed or PyPI.")

                parameter_coverage_with_stubs = parameter_coverage
                return_type_coverage_with_stubs = return_type_coverage
                skipped_files_with_stubs = skipped_files_non_tests

        # Add typeshed data if available
        if typeshed_data and package_name in typeshed_data:
            package_report["TypeshedData"] = typeshed_data[package_name]
        else:
            package_report["TypeshedData"] = {}

        package_report["CoverageData"]["parameter_coverage_with_stubs"] = (
            parameter_coverage_with_stubs
        )
        package_report["CoverageData"]["return_type_coverage_with_stubs"] = (
            return_type_coverage_with_stubs
        )

        skipped_files_total = max(skipped_files_with_stubs, skipped_tests)
        package_report["CoverageData"]["skipped_files"] = skipped_files_total

        # Write CLI
        if not parallel:
            generate_report(package_report, package_name)
    finally:
        # Clean up the temporary directory
        shutil.rmtree(temp_dir)
        if stub_package_dir:
            shutil.rmtree(stub_package_dir)

    return package_report


def get_packages_with_stubs() -> set[str]:
    with open(STUB_PACKAGES, "r") as f:
        data = json.load(f)
    return set(data)


def analyze_package_concurrently(
    package_data: dict[str, Any],
    rank: int,
    typeshed_data: dict[str, dict[str, Any]],
    packages_with_stubs: set[str],
) -> tuple[str, dict[str, Any]] | None:
    package_name = package_data["project"]
    download_count = package_data["download_count"]

    if package_name:  # Ensure package_name is not None
        return package_name, analyze_package(
            package_name,
            rank=rank,
            download_count=download_count,
            typeshed_data=typeshed_data,
            has_stub_package=package_name in packages_with_stubs,
            parallel=True,
        )
    return None


def parallel_analyze_packages(
    top_packages: list[dict[str, Any]],
    typeshed_data: dict[str, dict[str, Any]],
    packages_with_stubs: set[str],
) -> dict[str, Any]:
    package_report: dict[str, Any] = {}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {
            executor.submit(
                analyze_package_concurrently,
                package_data,
                rank,
                typeshed_data,
                packages_with_stubs,
            ): package_data
            for rank, package_data in enumerate(top_packages, start=1)
        }
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            if result:
                package_name, analysis_result = result
                package_report[package_name] = analysis_result
    sorted_pairs = sorted(
        [pair for pair in package_report.items()], key=lambda x: x[1]["DownloadRanking"]
    )
    package_report = {k: v for k, v in sorted_pairs}
    return package_report


def read_packages(file_path: str) -> list[str]:
    try:
        with open(file_path, "r") as f:
            return [line.strip() for line in f.readlines()]
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return []


def main(
    top_n: Optional[int] = None,
    package_name: Optional[str] = None,
    write_json: bool = False,
    write_html: bool = False,
    parallel: bool = False,
    create_daily: bool = False,
    package_list: Optional[str] = None,
    pyright_stats: Optional[bool] = False,
    output_list_only: Optional[bool] = False,
    prioritized: bool = False,
) -> None:
    package_report: dict[str, Any] = {}

    # Download the CSV file with typeshed stats
    typeshed_data = download_typeshed_csv()
    packages_with_stubs = get_packages_with_stubs()
    top_packages: list[dict[str, Any]] = []

    if package_name:
        # Analyze a specific package
        print(f"Analyzing specific package: {package_name}")
        package_report[package_name] = analyze_package(
            package_name,
            typeshed_data=typeshed_data,
            has_stub_package=package_name in packages_with_stubs,
        )
        top_packages = [package_report[package_name]]
    else:
        # Analyze top N packages
        sorted_packages = load_and_sort_top_packages(TOP_PYPI_PACKAGES)
        top_packages = sorted_packages[:top_n]

        if package_list:
            included_packages: list[str] = read_packages(package_list)
            top_packages = [
                package
                for package in top_packages
                if package["project"] in included_packages
            ]

        if parallel:
            package_report = parallel_analyze_packages(
                top_packages, typeshed_data, packages_with_stubs
            )
        else:
            for rank, package_data in enumerate(top_packages, start=1):
                name = package_data["project"]
                download_count = package_data["download_count"]
                package_report[name] = analyze_package(
                    name,
                    rank=rank,
                    download_count=download_count,
                    typeshed_data=typeshed_data,
                    has_stub_package=package_name in packages_with_stubs,
                )

    # Call pyright stats and merge data
    if pyright_stats:
        pyright_package_stats: dict[str, Any] = get_pyright_stats(
            [
                {
                    "package_name": package_data["project"],
                    "has_py_typed": package_report[package_data["project"]][
                        "HasPyTypedFile"
                    ],
                }
                for package_data in top_packages
            ]
        )

        for package, stats in pyright_package_stats.items():
            if package in package_report:
                package_report[package]["pyright_stats"] = stats

    html_report_file = HTML_REPORT_FILE
    json_report_file = JSON_REPORT_FILE
    historical_json_dir = os.path.join(HISTORICAL_DATA_DIR, "json")

    if output_list_only:
        os.makedirs("prioritized", exist_ok=True)
        html_report_file: str = os.path.join("prioritized", "index.html")
        json_report_file: str = os.path.join("prioritized", "package_report.json")
        historical_json_dir: str = os.path.join("prioritized", historical_json_dir)

    # Archive old report in data section
    if create_daily:
        archive_old_reports(historical_json_dir, json_report_file)

    # Conditionally write the JSON report
    if write_json:
        with open(json_report_file, "w") as json_file:
            json.dump(package_report, json_file, indent=4)
        print(f"{json_report_file} file generated.")

    # Conditionally generate the HTML report
    if write_html:
        generate_report_html(package_report, html_report_file, prioritized=prioritized)
        print("HTML report generated.")

    if create_daily:
        # Generate dates.json manifest for dynamic HTML pages (no static HTML generation needed)
        generate_dates_manifest(historical_json_dir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Analyze Python package type coverage."
    )
    parser.add_argument(
        "top_n", type=int, nargs="?", help="Analyze the top N PyPI packages."
    )
    parser.add_argument(
        "--package-name", type=str, help="Analyze a specific package by name."
    )
    parser.add_argument(
        "--write-json", action="store_true", help="Write the output to a JSON report."
    )
    parser.add_argument(
        "--write-html", action="store_true", help="Generate an HTML report."
    )
    parser.add_argument(
        "--create-daily",
        action="store_true",
        help="Create a daily report and archive previous data.",
    )
    parser.add_argument(
        "--parallel", action="store_true", help="Analyze packages in parallel."
    )
    parser.add_argument(
        "--pyright-stats", action="store_true", help="Run pyright stats."
    )
    parser.add_argument(
        "--package-list", type=str, help="Path to a file containing a list of packages."
    )

    parser.add_argument(
        "--output-list-only", action="store_true", help="Run pyright stats."
    )

    parser.add_argument(
        "--archive-prioritized", action="store_true", help="Run pyright stats."
    )

    args = parser.parse_args()

    if args.create_daily:
        main(
            top_n=(args.top_n or 8000),
            package_name=args.package_name,
            write_json=True,
            write_html=False,  # Dynamic HTML loads from JSON, no static generation needed
            create_daily=True,
        )
    elif args.package_name:
        main(
            package_name=args.package_name,
            write_json=args.write_json,
            write_html=args.write_html,
        )
    elif args.top_n:
        if not (1 <= args.top_n <= 8000):
            print("Error: <top_n> must be an integer between 1 and 8000.")
            sys.exit(1)
        main(
            top_n=args.top_n,
            write_json=args.write_json,
            write_html=args.write_html,
            parallel=args.parallel,
            pyright_stats=args.pyright_stats,
        )
    elif args.archive_prioritized:
        if not args.package_list:
            print("Error: Please provide a package list.")
            sys.exit(1)
        main(
            package_list=args.package_list,
            write_json=True,
            write_html=False,  # Dynamic HTML loads from JSON, no static generation needed
            parallel=args.parallel,
            pyright_stats=True,
            output_list_only=True,
            create_daily=True,
            prioritized=True,
        )
    elif args.package_list:
        main(
            package_list=args.package_list,
            write_json=args.write_json,
            write_html=args.write_html,
            parallel=args.parallel,
            pyright_stats=args.pyright_stats,
            output_list_only=args.output_list_only,
        )
    else:
        print("Error: Either provide a top N number or a package name.")
        sys.exit(1)