CodeChat/IssueToXML.py at main · yash1802/CodeChat · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import requests
import os
import nltk
from nltk.corpus import stopwords
import re
from XMLUtil import compress_XML

nltk.download("stopwords", quiet=True)
stop_words = set(stopwords.words("english"))

TOKEN = os.getenv('GITHUB_TOKEN', 'Enter your Github Personal Access Token')
headers = {"Authorization": f"token {TOKEN}"}

def escape_xml(text):
    return (
        str(text)
        .replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
    )


def convert_issue(issue_url):
    """
    Transforms a given github repository's issue into XML format (stored as .txt)
    """
    url_parts = issue_url.split("/")
    repo_owner = url_parts[3]
    repo_name = url_parts[4]
    issue_number = url_parts[-1]

    api_base_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/issues/{issue_number}"
    headers = {"Authorization": f"token {TOKEN}"}

    response = requests.get(api_base_url, headers=headers)
    issue_data = response.json()

    comments_url = issue_data["comments_url"]
    comments_response = requests.get(comments_url, headers=headers)
    comments_data = comments_response.json()

    xml_text = f'<source type="github_issue" url="{issue_url}">\n'
    xml_text += '<issue_info>\n'
    xml_text += f'<title>{escape_xml(issue_data["title"])}</title>\n'
    xml_text += f'<description>{escape_xml(issue_data["body"])}</description>\n'
    xml_text += '<comments>\n'

    for comment in comments_data:
        xml_text += '<comment>\n'
        xml_text += f'<author>{escape_xml(comment["user"]["login"])}</author>\n'
        xml_text += f'<content>{escape_xml(comment["body"])}</content>\n'

        code_snippets = re.findall(r'https://github.com/.*#L\d+-L\d+', comment['body'])
        for snippet_url in code_snippets:
            url_parts = snippet_url.split("#")
            file_url = url_parts[0].replace("/blob/", "/raw/")
            line_range = url_parts[1]
            start_line, end_line = map(int, line_range.split("-")[0][1:]), map(int, line_range.split("-")[1][1:])

            file_response = requests.get(file_url, headers=headers)
            file_content = file_response.text

            code_lines = file_content.split("\n")[start_line - 1:end_line]
            code_snippet = "\n".join(code_lines)

            xml_text += '<code_snippet>\n'
            xml_text += f'<![CDATA[{code_snippet}]]>\n'
            xml_text += '</code_snippet>\n'

        xml_text += '</comment>\n'

    xml_text += '</comments>\n'
    xml_text += '</issue_info>\n'
    xml_text += '</source>'

    return xml_text


def convert_issue_to_xml(input_path):
    XML_file = "Issue_XML_output.txt"
    XML_file_compressed = "compressed_Issue_XML_output.txt"

    print("\nProcessing the github issue data...")
    final_output = convert_issue(input_path)

    with open(XML_file, "w", encoding="utf-8") as file:
        file.write(final_output)

    compress_XML(XML_file, XML_file_compressed)
    print("processing complete.")