-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgithubMetricsToContractStats.py
More file actions
85 lines (68 loc) · 3.18 KB
/
githubMetricsToContractStats.py
File metadata and controls
85 lines (68 loc) · 3.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# Merges the rows of 3-projects-stats.csv and constraintsAcrossVersions.csv by program name.
# Then prints some statistics and plots the GitHub metrics, including their relation to the number of contracts in the last version.
import csv
import pandas as pd
import matplotlib.pyplot as plt
datasetStatsFile = "./base/3-projects-stats.csv"
constraintsFile = "./results/evolution/constraintsAcrossVersions.csv"
# Merge CSV files.
def replace_slash(string):
    """Return *string* with every "/" swapped for "-"."""
    segments = string.split("/")
    return "-".join(segments)
# Index the dataset statistics by normalised application name.
# newline='' lets the csv module handle line breaks inside quoted fields, and
# the explicit encoding keeps the result independent of the platform locale.
with open(datasetStatsFile, 'r', newline='', encoding='utf-8') as file1:
    csv_reader1 = csv.reader(file1)
    next(csv_reader1, None)  # Skip the header row (an empty file is tolerated)
    data_dict = {}
    for row in csv_reader1:
        if not row:
            continue  # csv.reader yields [] for blank lines
        # Column 1 is the join key; column 0 is not carried into the merge.
        modified_col = replace_slash(row[1].lower())
        data_dict[modified_col] = row[1:]

# Append each constraints row to the statistics row of the same program.
with open(constraintsFile, 'r', newline='', encoding='utf-8') as file2:
    csv_reader2 = csv.reader(file2)
    next(csv_reader2, None)  # Skip the header row
    merged_rows = []
    matched_keys = set()
    for row in csv_reader2:
        if not row:
            continue
        modified_program = replace_slash(row[0].lower())
        stats_row = data_dict.get(modified_program)
        if stats_row is None:
            continue  # Program has no entry in the statistics file
        merged_rows.append(stats_row + row)
        matched_keys.add(modified_program)

# Written as UTF-8 because the file is read back with pandas, whose default
# encoding is UTF-8 regardless of the locale.
with open('merged.csv', 'w', newline='', encoding='utf-8') as merged_file:
    csv_writer = csv.writer(merged_file)
    csv_writer.writerow(['APPLICATION NAME', 'GITHUB LINK', 'LANGUAGE', 'WATCHERS', 'STARS', 'FORKS', 'CONTRIBUTORS',
                         'DATE OF LAST COMMIT', 'TOTAL MERGED PULL REQUESTS', 'TOTAL CLOSED PULL REQUESTS',
                         '% OF PULL REQUESTS ACCEPTED', 'program', 'version1', 'methods1', 'constraints1',
                         'version2', 'methods2', 'constraints2'])
    csv_writer.writerows(merged_rows)

# Report statistics rows that found no partner in the constraints file.
unmatched_rows = [stats_row for key, stats_row in data_dict.items() if key not in matched_keys]
print("Unmatched rows:")
for row in unmatched_rows:
    print(row)
df = pd.read_csv('./merged.csv')

# Relation between GitHub Metrics and Contracts dispersion plot
# Positional columns of merged.csv: every entry but the last is plotted on the
# y axis; the last entry is the contracts column used for the x axis.
column_numbers_dispersion = [3, 4, 5, 6, 10, 17]
invalid_columns = [col_num for col_num in column_numbers_dispersion if col_num >= len(df.columns)]
if invalid_columns:
    # Fail early with a readable message instead of an opaque indexer error.
    raise IndexError(
        f"merged.csv has {len(df.columns)} columns; cannot plot column positions {invalid_columns}"
    )

# Keep only the rows whose contracts count lies in the inclusive range below.
# The filter uses the same column that is plotted on the x axis.
min_contracts, max_contracts = 100, 2000
contracts_col = column_numbers_dispersion[-1]
filtered_df = df[df.iloc[:, contracts_col].between(min_contracts, max_contracts)]
df_filtered = filtered_df.iloc[:, column_numbers_dispersion]

x_label = df.columns[contracts_col]
for position, col_num in enumerate(column_numbers_dispersion[:-1]):
    y_label = df.columns[col_num]
    plt.scatter(df_filtered.iloc[:, -1], df_filtered.iloc[:, position], alpha=0.5)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(f"{y_label} vs. {x_label}")
    # Open a fresh figure so the next plot does not draw over this one.
    plt.figure()
# GitHub Metrics Box plot
# One box plot (outliers hidden) per metric column, each in its own figure.
column_numbers_box = [3, 4, 5, 6]
box_labels = ["GitHub Watchers", "GitHub Stars", "GitHub Forks", "GitHub Contributors"]
for position, (column_number, box_label) in enumerate(zip(column_numbers_box, box_labels)):
    if position:
        # Every plot after the first gets a new figure; creating it up front
        # (rather than after each plot) avoids a trailing empty window.
        plt.figure()
    column_data = df.iloc[:, column_number]
    # Quartiles cannot be computed over NaN, so missing values are dropped.
    plt.boxplot(column_data.dropna(), showfliers=False)
    plt.title(box_label)
    plt.xticks([])  # A single box needs no x tick
    # Outliers are hidden in the plot, so the column maximum is printed instead.
    print(f"Max outlier in column {df.columns[column_number]}: {column_data.max()}")

plt.show()