I have rule-based code that prints the noun (or run of up to three consecutive nouns) that is immediately followed by a verb in a sentence:
Python
x
28
28
1
for text_id, text in enumerate(news_df['news_title'].values):

    # Remove commas, full stops and hyphens before tagging
    text = text.replace(',', '').replace('.', '').replace('-', '')
    # NOTE(review): POSTAG is defined outside this snippet; the code below
    # indexes its result as a sequence of (word, tag) pairs -- confirm.
    sentence_tags = POSTAG(text.lower())

    print(text)

    # Find the first run of one to three nouns that is immediately followed
    # by a verb, and print the words of that run.
    for index, part in enumerate(sentence_tags):
        if 'NN' not in part[1]:
            continue

        # Look ahead with explicit bounds checks instead of relying on a bare
        # `except:` to absorb the IndexError at the end of the sentence; a
        # bare except would also hide unrelated errors.
        matched = None
        for run_length in (1, 2, 3):
            end = index + run_length
            if end >= len(sentence_tags):
                break  # ran off the end of the sentence: no verb follows
            next_tag = sentence_tags[end][1]
            if 'VB' in next_tag:
                matched = sentence_tags[index:end]
                break
            if 'NN' not in next_tag:
                break  # the noun run is interrupted by something else

        if matched:
            print(">", *(item[0] for item in matched))
            break  # only the first match per sentence is reported

    # Blank line between sentences
    print()
28
The output of a sentence following this rule:
high school football players charged after video surfaces showing hazing
JavaScript
1
2
1
> school football players
2
trump accuser pushes new york to pass the adult survivors act plans to sue
JavaScript
1
2
1
> trump accuser
2
Is there a way to also print the position of each noun that was printed by the rule? For example:
JavaScript
1
2
1
>trump accuser , [0,5,"NN"] , [6,13,"VB"]
2
Advertisement
Answer
I changed the script and separated out the state-machine segment. The most serious limitation of this program, in my opinion, is that it returns only the first matching pattern in each sentence (this is easy to fix).
Python
1
57
57
1
import pandas as pd
2
import nltk
3
# Alias for NLTK's part-of-speech tagger.
POSTAG = nltk.pos_tag

# Two sample headlines, stored in the 'text' column.
df = pd.DataFrame({
    'text': [
        'high school football players charged after video surfaces showing hazing',
        'trump accuser pushes new york to pass the adult survivors act plans to sue',
    ],
})
5
def token_spans(words):
    """Return ``[start, end]`` character offsets for each word.

    Offsets are computed as if the words were joined by single spaces, with
    ``end`` exclusive.

    NOTE(review): the offsets therefore refer to the cleaned, tokenized
    sentence. They only match positions in the raw text if the tokenizer
    neither splits nor rewrites tokens and the text uses single spaces.
    """
    spans = []
    cursor = 0
    for word in words:
        spans.append([cursor, cursor + len(word)])
        cursor += len(word) + 1  # +1 for the separating space
    return spans


def first_noun_run_before_verb(words, tags, spans):
    """Find the first run of consecutive nouns immediately followed by a verb.

    ``words``, ``tags`` and ``spans`` are parallel lists. A tag starting with
    ``'NN'`` counts as a noun and one starting with ``'VB'`` as a verb.

    Returns a ``(words, tags, spans)`` tuple holding the slices for the
    matched noun run, or three empty lists when no noun run is directly
    followed by a verb. A noun run at the very end of the sentence, with no
    verb after it, is NOT reported.
    """
    run_start = None  # index of the first noun of the current run, if any
    for position, tag in enumerate(tags):
        if tag.startswith('NN'):
            if run_start is None:
                run_start = position
        elif tag.startswith('VB') and run_start is not None:
            run = slice(run_start, position)
            return words[run], tags[run], spans[run]
        else:
            # Any other token (or a verb with no noun before it) breaks the run.
            run_start = None
    return [], [], []


if __name__ == "__main__":
    for text in df['text'].values:

        # Remove commas, full stops and hyphens before tokenizing
        text = text.replace(',', '').replace('.', '').replace('-', '')
        tokens = nltk.word_tokenize(text.lower())
        sentence_tags = POSTAG(tokens)
        words = [item[0] for item in sentence_tags]
        tags = [item[1] for item in sentence_tags]
        start_end = token_spans(words)

        # the state machine
        words_to_print, tags_to_print, start_end_to_print = (
            first_noun_run_before_verb(words, tags, start_end)
        )

        print('> ', ' '.join(words_to_print), ' '.join([str(item[0])+' '+str(item[1]) for item in zip(start_end_to_print, tags_to_print)]))
57
output:
JavaScript
1
3
1
> school football players [5, 11] NN [12, 20] NN [21, 28] NNS
2
> trump accuser [0, 5] NN [6, 13] NN
3