-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathloop_detection_three_hop.py
More file actions
111 lines (100 loc) · 5.01 KB
/
loop_detection_three_hop.py
File metadata and controls
111 lines (100 loc) · 5.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import pandas as pd
import numpy as np
from itertools import product
from tqdm import tqdm
def detect_connection(node_subject, node_object, max_hop, data):
    """Placeholder for counting connections between two nodes.

    The search is not implemented yet: every argument is accepted but
    ignored, and the reported connection count is always zero.

    Args:
        node_subject: Start node of the connection (currently unused).
        node_object: End node of the connection (currently unused).
        max_hop: Hop limit for the search (currently unused).
        data: Triples to search in (currently unused).

    Returns:
        Always ``0``.
    """
    # TODO: detect two-hop connections starting from node_subject.
    return 0
# Read the tab-separated triples file. There is no header row, so the columns
# are labelled 0, 1 and 2; the code below uses them as subject, relation and
# object respectively.
data = pd.read_table('/data/umls/train.txt', sep='\t', header=None)

# Distinct subjects, distinct relations, and every entity that appears as
# either a subject or an object.
unique_entities_sub = list(data[0].unique())
unique_relations = list(data[1].unique())
unique_entities = np.unique(unique_entities_sub + list(data[2].unique()))
# Cartesian product of the distinct column-0 values with the distinct
# column-1 values.
unique_s_o_pairs = list(product(unique_entities_sub, unique_relations))

# Create the indexes
data_arr = np.array(data)
# A key is present (with value 1) exactly when that triple occurs in the data.
existing_triples = {(row[0], row[1], row[2]): 1 for row in data_arr}
#loop_candidates = [data.loc[(data[0]==data_arr[i][2])] for i in range(len(data_arr))]
# Placeholders for a two-hop pass that was never implemented; kept so the
# module-level names still exist.
already_checked_two_hop = []
two_hop_count = 0

# Nodes already recorded as the second or third member of a loop. A set keeps
# the membership tests below O(1) instead of scanning a growing list.
already_checked_three_hop = set()
three_hop_count = 0
# One (first, second, third) tuple per recorded loop. The frame is built once
# after the search because DataFrame.append was removed in pandas 2.0 (and
# copied the whole frame on every call before that).
loop_rows = []

for sub in tqdm(unique_entities_sub):
    # Every distinct object reachable from `sub` in one hop.
    for sub_2 in data.loc[data[0] == sub, 2].unique():
        # Distinct targets of the second node, ignoring its self-loops.
        onward_edges = data.loc[(data[0] == sub_2) & (data[2] != sub_2)]
        candidate_objects_2 = onward_edges[2].unique()

        # sub -> sub_2 -> sub closes after two hops, so this pair is skipped.
        if sub in candidate_objects_2:
            continue
        # Skip second nodes that already belong to a recorded loop.
        if sub_2 in already_checked_three_hop:
            continue
        # A third node is required, and none of the candidate third nodes may
        # already belong to a recorded loop.
        if len(candidate_objects_2) == 0:
            continue
        if any(node in already_checked_three_hop
               for node in candidate_objects_2):
            continue

        # NOTE(review): only the first candidate third node is tested for an
        # edge back to `sub`; later candidates are never examined. This is
        # kept as-is so the printed count does not change -- confirm whether
        # all candidates were meant to be checked.
        third = candidate_objects_2[0]
        closes_loop = ((data[0] == third) & (data[2] == sub)).any()
        if not closes_loop:
            continue

        three_hop_count += 1
        already_checked_three_hop.update((sub_2, third))
        loop_rows.append((sub, sub_2, third))

# Columns 0, 1, 2 hold the first, second and third node of each recorded loop.
df = pd.DataFrame(loop_rows)
print(three_hop_count)
# for s_o in unique_s_o_pairs:
# for triple in np.array(data):
# try:
# other_side_exists = existing_triples[triple[0],s_o[1],s_o[0]]
# node_subject=triple[0]
# node_object = data.loc[(data[0]==s_o[0]) & (data[1]==s_o[1])][2].unique()
# print('node_subject', node_subject)
# if len(node_object)>0:
# print('node_object', node_object)
# print('node_relation', s_o[1])
# probable_objects_for_s = data.loc[(data[0]==node_subject)][2].unique()
# print('probable_object_for_node_subject', probable_objects_for_s)
# #detect_connection(node_subject, node_object, probable_objects_for_s ,data)
# print('##########################################')
# #TODO we need to get the distance between node_subject and node_object
# except:
# continue