syntacticmodule.py
from nltk import ngrams
from nltk.tokenize import word_tokenize
from rapidfuzz.distance import Levenshtein


class Syntactic:
    """ A simple abstraction layer for using the Syntactic module of the CSO classifier """

    def __init__(self, cso = None, paper = None):
        """Function that initialises an object of class CSOClassifierSyntactic and all its members.

        Args:
            cso (Ontology class): Computer Science Ontology.
            paper (Paper class): object containing the paper.
        """
        # Initialise variables to store CSO data - loads into memory
        self.cso = cso                      # the ontology object
        self.min_similarity = 0.90          # value of minimum similarity
        self.paper = paper                  # the paper object
        self.explanation = dict()           # the explanation dictionary
        self.extracted_topics = dict()      # dictionary with the extracted topics (including similarity measures)

    def set_paper(self, paper):
        """Function that initialises the paper variable in the class.

        Args:
            paper (either string or dictionary): the paper to analyse. It can be a full string in which the content
            is already merged, or a dictionary {"title": "", "abstract": "", "keywords": ""}.
        """
        self.paper = paper
        self.explanation = dict()   # resets the dictionary (this is important when working in batch mode)

    def set_min_similarity(self, msm):
        """Function that sets a different value for the minimum similarity.

        Args:
            msm (float): minimum similarity value.
        """
        self.min_similarity = msm

    def reset_explanation(self):
        """ Resets the explanation.
        """
        self.explanation = dict()

    def get_explanation(self):
        """ Returns the explanation.
        """
        return self.explanation

    def classify_syntactic(self):
        """Function that classifies a single paper. If you have a collection of papers,
        you must call this function for each paper and organise the results.
        Initially, it cleans the paper, removing English stopwords and punctuation.
        Then it extracts n-grams (of size 1, 2 and 3) and, using the Levenshtein distance, checks the
        similarity of each of them against the topics in the ontology.
        Next, it climbs the ontology, selecting either the first broader topic or the whole set of
        broader topics, until the root is reached.
        (An illustrative usage sketch is given at the bottom of this file.)

        Returns:
            final_topics (list): the list of final topics.
        """
        # analysing similarity with terms in the ontology
        self.extracted_topics = self.__statistic_similarity()
        # stripping explanation
        final_topics = self.__strip_service_fields(self.extracted_topics)
        return final_topics

    def get_syntactic_topics_weights(self):
        """Function that returns the full set of topics with their similarity measures (weights).

        Returns:
            weights (dictionary): the found topics mapped to their (best) similarity score.
        """
        weights = dict()
        for topic, sim_values in self.extracted_topics.items():
            if len(sim_values) == 1:
                weights[topic] = sim_values[0]["similarity"]
            else:
                # a topic matched by several n-grams keeps its best similarity
                weights[topic] = max(sim_value["similarity"] for sim_value in sim_values)
        return weights

    def __statistic_similarity(self):
        """Function that finds the similarity between the previously extracted concepts and the topics in the ontology.

        Returns:
            found_topics (dictionary): the found topics with their similarity and the n-gram analysed.
        """
        found_topics = dict()
        concepts = self.paper.get_syntactic_chunks()
        for concept in concepts:
            matched_trigrams = set()
            matched_bigrams = set()
            for comprehensive_grams in self.__get_ngrams(concept):
                position = comprehensive_grams["position"]
                size = comprehensive_grams["size"]
                grams = comprehensive_grams["ngram"]
                # if we already matched the current token to a topic, don't reprocess it
                if size <= 1 and (position in matched_bigrams or position-1 in matched_bigrams):
                    continue
                if size <= 2 and (position in matched_trigrams or position-1 in matched_trigrams or position-2 in matched_trigrams):
                    continue
                # otherwise, join the n-gram back into a string: ('quick', 'brown') => 'quick brown'
                gram = " ".join(grams)
                try:
                    # the topic_stems index blocks topics by their first 4 characters;
                    # if no topic shares the first 4 characters of the n-gram, move on
                    #topic_block = [key for key, _ in self.cso.topics.items() if key.startswith(gram[:4])]
                    topic_block = self.cso.topic_stems[gram[:4]]
                except KeyError:
                    continue
                for topic in topic_block:
                    # otherwise look for an inexact match
                    match_ratio = Levenshtein.normalized_similarity(topic, gram)
                    if match_ratio >= self.min_similarity:
                        try:
                            # if a 'primary label' exists for the current topic, use it instead of the matched topic
                            topic = self.cso.primary_labels[topic]
                        except KeyError:
                            pass
                        # note the tokens that matched the topic and how closely
                        if topic not in found_topics:
                            found_topics[topic] = list()
                        found_topics[topic].append({'matched': gram, 'similarity': match_ratio})
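                        # each record keeps the matched n-gram and its score; e.g. the n-gram
                        # "machine learnin" (note the typo) still matches the topic
                        # "machine learning" with similarity 1 - 1/16 = 0.9375, which is
                        # above the 0.90 threshold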
                        # don't reprocess the current tokens
                        if size == 2:
                            matched_bigrams.add(position)
                        elif size == 3:
                            matched_trigrams.add(position)
                        # explanation bit
                        if topic not in self.explanation:
                            self.explanation[topic] = set()
                        self.explanation[topic].add(gram)
        return found_topics

    def __get_ngrams(self, concept):
        """ Function that yields the n-grams of a concept in decreasing order of size (3, 2, and 1).
        """
        for n_size in range(3, 0, -1):
            pos = 0
            for ngram in ngrams(word_tokenize(concept, preserve_line=True), n_size):
                yield {"position": pos, "size": n_size, "ngram": ngram}
                pos += 1
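        # Illustrative trace for the concept "support vector machines": the trigram
        # ('support', 'vector', 'machines') is yielded first, then the bigrams
        # ('support', 'vector') and ('vector', 'machines'), then the three unigrams,
        # each wrapped as {"position": ..., "size": ..., "ngram": ...}.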

    def __strip_service_fields(self, found_topics):
        """Function that removes the statistical values from the dictionary containing the found topics,
        returning only the topics themselves.

        Args:
            found_topics (dictionary): the topics found with string similarity.

        Returns:
            topics (list): the list of topics.
        """
        topics = list(set(found_topics.keys()))    # takes only the keys
        return topics
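

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the module). The stub classes below
# are hypothetical stand-ins that expose only the members this module actually
# uses: topic_stems and primary_labels on the ontology, and
# get_syntactic_chunks() on the paper. In real use these would be the Ontology
# and Paper classes of the CSO classifier.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    class _StubOntology:
        # maps the first 4 characters of a topic label to the candidate topics
        topic_stems = {"mach": ["machine learning"], "neur": ["neural networks"]}
        # no primary labels in this toy ontology
        primary_labels = {}

    class _StubPaper:
        def get_syntactic_chunks(self):
            # chunks as the Paper class would extract them from title/abstract/keywords
            return ["machine learning", "deep neural networks"]

    classifier = Syntactic(cso=_StubOntology(), paper=_StubPaper())
    print(classifier.classify_syntactic())            # e.g. ['machine learning', 'neural networks'] (order may vary)
    print(classifier.get_syntactic_topics_weights())  # e.g. {'machine learning': 1.0, 'neural networks': 1.0}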