I have rule-based code that prints the noun (or run of nouns) that is immediately followed by a verb in a sentence:
# For every news title, print the title and then the first run of one to three
# consecutive nouns that is immediately followed by a verb.
# `news_df` and `POSTAG` are expected to be defined earlier in the script;
# POSTAG presumably returns a sequence of (word, tag) pairs -- confirm.
for text_id, text in enumerate(news_df['news_title'].values):
    # Remove commas, full stops and hyphens before tagging.
    text = text.replace(',', '').replace('.', '').replace('-', '')
    sentence_tags = POSTAG(text.lower())
    print(text)

    # Scan the tagged tokens, looking ahead up to three positions.
    for index, part in enumerate(sentence_tags):
        try:
            if 'NN' in part[1] and 'VB' in sentence_tags[index + 1][1]:
                # noun, verb
                print(">", part[0])
                break
            elif ('NN' in part[1]
                  and 'NN' in sentence_tags[index + 1][1]
                  and 'VB' in sentence_tags[index + 2][1]):
                # noun, noun, verb
                print(">", part[0], sentence_tags[index + 1][0])
                break
            elif ('NN' in part[1]
                  and 'NN' in sentence_tags[index + 1][1]
                  and 'NN' in sentence_tags[index + 2][1]
                  and 'VB' in sentence_tags[index + 3][1]):
                # noun, noun, noun, verb
                print(">", part[0], sentence_tags[index + 1][0],
                      sentence_tags[index + 2][0])
                break
        except IndexError:
            # The look-ahead ran past the end of the sentence, so no pattern
            # can start at this token. Only IndexError is expected here; any
            # other exception is a real bug and should surface.
            pass
    print()
The output of a sentence following this rule:
high school football players charged after video surfaces showing hazing
> school football players
trump accuser pushes new york to pass the adult survivors act plans to sue
> trump accuser
Is there a way to also print the position of each noun that was printed because of the rule? For example:
>trump accuser , [0,5,"NN"] , [6,13,"VB"]
Advertisement
Answer
I changed the script and separated out the state-machine segment. The most serious problem with this program, IMO, is that it returns only the first matching pattern in each sentence (you can fix that quickly).
import pandas as pd


def token_spans(text, tokens):
    """Return ``[start, end]`` character offsets of each token within *text*.

    Tokens are located left to right with ``str.find``, so the offsets stay
    correct when tokens are not separated by exactly one space (repeated
    whitespace, contractions split by the tokenizer, ...).

    If a token cannot be found verbatim (e.g. the tokenizer rewrote it), its
    start is estimated as the next non-whitespace position and the search
    cursor is not advanced past it, so later tokens can still be located.
    """
    spans = []
    cursor = 0
    for token in tokens:
        start = text.find(token, cursor)
        if start == -1:
            # Token text differs from the source text: estimate its position.
            start = cursor
            while start < len(text) and text[start].isspace():
                start += 1
            cursor = start
        else:
            cursor = start + len(token)
        spans.append([start, start + len(token)])
    return spans


def noun_run_before_verb(words, tags, spans):
    """Return the first run of consecutive nouns directly followed by a verb.

    *words*, *tags* and *spans* are parallel sequences. A noun is any tag
    starting with ``'NN'``; a verb is any tag starting with ``'VB'``.

    Returns a list of ``(word, tag, span)`` tuples, or an empty list when no
    noun run is followed by a verb. A noun run that reaches the end of the
    sentence without a verb is not a match.
    """
    run = []
    for word, tag, span in zip(words, tags, spans):
        if tag.startswith('NN'):
            run.append((word, tag, span))
        elif tag.startswith('VB') and run:
            return run
        else:
            # Any other token breaks the noun run; start over.
            run = []
    return []


def main():
    """Tag the sample sentences and print the matched nouns with offsets."""
    # Imported here so the helpers above stay usable without NLTK installed.
    import nltk

    df = pd.DataFrame({'text': [
        'high school football players charged after video surfaces showing hazing',
        'trump accuser pushes new york to pass the adult survivors act plans to sue',
    ]})

    for text in df['text'].values:
        # Remove commas, full stops and hyphens. The printed offsets are
        # relative to this cleaned, lower-cased text.
        cleaned = text.replace(',', '').replace('.', '').replace('-', '').lower()
        sentence_tags = nltk.pos_tag(nltk.word_tokenize(cleaned))
        words = [item[0] for item in sentence_tags]
        tags = [item[1] for item in sentence_tags]

        run = noun_run_before_verb(words, tags, token_spans(cleaned, words))
        print('> ',
              ' '.join(word for word, _, _ in run),
              ' '.join(f'{span} {tag}' for _, tag, span in run))


if __name__ == '__main__':
    main()
output:
> school football players [5, 11] NN [12, 20] NN [21, 28] NNS
> trump accuser [0, 5] NN [6, 13] NN