Python

My dataset is the sales transaction history of an online store. I need to create a category based on the text in the Description column. I have done some text pre-processing and clustering. This is what the head of the dataframe cat_df looks like:

	Description	Text	Cluster9
0	WHITE HANGING HEART T-LIGHT HOLDER	white hanging heart t-light holder	1
1	WHITE METAL LANTERN	white metal lantern	4
2	CREAM CUPID HEARTS COAT HANGER	cream cupid hearts coat hanger	0
3	KNITTED UNION FLAG HOT WATER BOTTLE	knitted union flag hot water bottle	8
4	RED WOOLLY HOTTIE WHITE HEART	red woolly hottie white heart	1

I created a groupby for each cluster:

cluster9 = cat_df.groupby(['Cluster9'])['Text'].unique()

cluster9[0]
array(['cream cupid hearts coat hanger', 'hand warmer union jack',
       'hand warmer red polka dot', ..., 'set 10 cards snowy robin 17099',
       'set 10 cards swirly xmas tree 17104', 'letter "u" bling key ring'],
      dtype=object)

JavaScript
​x
 
cluster9 = cat_df.groupby(['Cluster9'])['Text'].unique()
​
cluster9[0]
array(['cream cupid hearts coat hanger', 'hand warmer union jack',
       'hand warmer red polka dot', ..., 'set 10 cards snowy robin 17099',
       'set 10 cards swirly xmas tree 17104', 'letter "u" bling key ring'],
      dtype=object)
​

Now I want to tokenize and count the words per cluster index.

from nltk.tokenize import word_tokenize
from collections import Counter

counter = Counter(word_tokenize(cluster9[0]))

JavaScript
 
from nltk.tokenize import word_tokenize
from collections import Counter
​
counter = Counter(word_tokenize(cluster9[0]))
​

But I got an error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
C:\Users\MARTIN~1\AppData\Local\Temp/ipykernel_11488/1782743253.py in <module>
      2 from collections import Counter
      3 
----> 4 counter = Counter(word_tokenize(cluster9[0]))

~anaconda3libsite-packagesnltktokenize__init__.py in word_tokenize(text, language, preserve_line)
    127     :type preserve_line: bool
    128     """
--> 129     sentences = [text] if preserve_line else sent_tokenize(text, language)
    130     return [
    131         token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)

~anaconda3libsite-packagesnltktokenize__init__.py in sent_tokenize(text, language)
    105     """
    106     tokenizer = load(f"tokenizers/punkt/{language}.pickle")
--> 107     return tokenizer.tokenize(text)
    108 
    109 

~anaconda3libsite-packagesnltktokenizepunkt.py in tokenize(self, text, realign_boundaries)
   1275         Given a text, returns a list of the sentences in that text.
   1276         """
-> 1277         return list(self.sentences_from_text(text, realign_boundaries))
   1278 
   1279     def debug_decisions(self, text):

~anaconda3libsite-packagesnltktokenizepunkt.py in sentences_from_text(self, text, realign_boundaries)
   1332         follows the period.
   1333         """
-> 1334         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
   1335 
   1336     def _slices_from_text(self, text):

~anaconda3libsite-packagesnltktokenizepunkt.py in <listcomp>(.0)
   1332         follows the period.
   1333         """
-> 1334         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
   1335 
   1336     def _slices_from_text(self, text):

~anaconda3libsite-packagesnltktokenizepunkt.py in span_tokenize(self, text, realign_boundaries)
   1322         if realign_boundaries:
   1323             slices = self._realign_boundaries(text, slices)
-> 1324         for sentence in slices:
   1325             yield (sentence.start, sentence.stop)
   1326 

~anaconda3libsite-packagesnltktokenizepunkt.py in _realign_boundaries(self, text, slices)
   1363         """
   1364         realign = 0
-> 1365         for sentence1, sentence2 in _pair_iter(slices):
   1366             sentence1 = slice(sentence1.start + realign, sentence1.stop)
   1367             if not sentence2:

~anaconda3libsite-packagesnltktokenizepunkt.py in _pair_iter(iterator)
    317     iterator = iter(iterator)
    318     try:
--> 319         prev = next(iterator)
    320     except StopIteration:
    321         return

~anaconda3libsite-packagesnltktokenizepunkt.py in _slices_from_text(self, text)
   1336     def _slices_from_text(self, text):
   1337         last_break = 0
-> 1338         for match in self._lang_vars.period_context_re().finditer(text):
   1339             context = match.group() + match.group("after_tok")
   1340             if self.text_contains_sentbreak(context):

TypeError: cannot use a string pattern on a bytes-like object

JavaScript
 
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
C:UsersMARTIN~1AppDataLocalTemp/ipykernel_11488/1782743253.py in <module>
      2 from collections import Counter
      3 
----> 4 counter = Counter(word_tokenize(cluster9[0]))
​
~anaconda3libsite-packagesnltktokenize__init__.py in word_tokenize(text, language, preserve_line)
    127     :type preserve_line: bool
    128     """
--> 129     sentences = [text] if preserve_line else sent_tokenize(text, language)
    130     return [
    131         token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
​
~anaconda3libsite-packagesnltktokenize__init__.py in sent_tokenize(text, language)
    105     """
    106     tokenizer = load(f"tokenizers/punkt/{language}.pickle")
--> 107     return tokenizer.tokenize(text)
    108 
    109 
​
~anaconda3libsite-packagesnltktokenizepunkt.py in tokenize(self, text, realign_boundaries)
   1275         Given a text, returns a list of the sentences in that text.
   1276         """
-> 1277         return list(self.sentences_from_text(text, realign_boundaries))
   1278 
   1279     def debug_decisions(self, text):
​
~anaconda3libsite-packagesnltktokenizepunkt.py in sentences_from_text(self, text, realign_boundaries)
   1332         follows the period.
   1333         """
-> 1334         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
   1335 
   1336     def _slices_from_text(self, text):
​
~anaconda3libsite-packagesnltktokenizepunkt.py in <listcomp>(.0)
   1332         follows the period.
   1333         """
-> 1334         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
   1335 
   1336     def _slices_from_text(self, text):
​
~anaconda3libsite-packagesnltktokenizepunkt.py in span_tokenize(self, text, realign_boundaries)
   1322         if realign_boundaries:
   1323             slices = self._realign_boundaries(text, slices)
-> 1324         for sentence in slices:
   1325             yield (sentence.start, sentence.stop)
   1326 
​
~anaconda3libsite-packagesnltktokenizepunkt.py in _realign_boundaries(self, text, slices)
   1363         """
   1364         realign = 0
-> 1365         for sentence1, sentence2 in _pair_iter(slices):
   1366             sentence1 = slice(sentence1.start + realign, sentence1.stop)
   1367             if not sentence2:
​
~anaconda3libsite-packagesnltktokenizepunkt.py in _pair_iter(iterator)
    317     iterator = iter(iterator)
    318     try:
--> 319         prev = next(iterator)
    320     except StopIteration:
    321         return
​
~anaconda3libsite-packagesnltktokenizepunkt.py in _slices_from_text(self, text)
   1336     def _slices_from_text(self, text):
   1337         last_break = 0
-> 1338         for match in self._lang_vars.period_context_re().finditer(text):
   1339             context = match.group() + match.group("after_tok")
   1340             if self.text_contains_sentbreak(context):
​
TypeError: cannot use a string pattern on a bytes-like object
​

How do I convert cluster9[0] into just one long string so I can pass it to word_tokenize and Counter?

I also tried spacy.

import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(cluster9[0])

Error:
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
C:\Users\MARTIN~1\AppData\Local\Temp/ipykernel_11488/3019562737.py in <module>
      1 import spacy
      2 nlp = spacy.load("en_core_web_sm")
----> 3 doc = nlp(cluster9[0])

~anaconda3libsite-packagesspacylanguage.py in __call__(self, text, disable, component_cfg)
    982         DOCS: https://spacy.io/api/language#call
    983         """
--> 984         doc = self.make_doc(text)
    985         if component_cfg is None:
    986             component_cfg = {}

~anaconda3libsite-packagesspacylanguage.py in make_doc(self, text)
   1064                 Errors.E088.format(length=len(text), max_length=self.max_length)
   1065             )
-> 1066         return self.tokenizer(text)
   1067 
   1068     def update(

TypeError: Argument 'string' has incorrect type (expected str, got numpy.ndarray)

JavaScript
 
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(cluster9[0])
​
Error:
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
C:UsersMARTIN~1AppDataLocalTemp/ipykernel_11488/3019562737.py in <module>
      1 import spacy
      2 nlp = spacy.load("en_core_web_sm")
----> 3 doc = nlp(cluster9[0])
​
~anaconda3libsite-packagesspacylanguage.py in __call__(self, text, disable, component_cfg)
    982         DOCS: https://spacy.io/api/language#call
    983         """
--> 984         doc = self.make_doc(text)
    985         if component_cfg is None:
    986             component_cfg = {}
​
~anaconda3libsite-packagesspacylanguage.py in make_doc(self, text)
   1064                 Errors.E088.format(length=len(text), max_length=self.max_length)
   1065             )
-> 1066         return self.tokenizer(text)
   1067 
   1068     def update(
​
TypeError: Argument 'string' has incorrect type (expected str, got numpy.ndarray)
​

Any help and suggestions will be appreciated. Thank you.

Answer

I took your sample data and created a dummy dataframe from it.

import pandas as pd
from collections import Counter

# Tiny stand-in for the real data: each row is (pre-processed text, cluster label).
sample_rows = [
    ['white hanging heart t-light holder', '1'],
    ['white metal lantern', '4'],
    ['red woolly hottie white heart', '1'],
    ['knitted union flag hot water bottle', '4'],
]
df = pd.DataFrame(sample_rows, columns=['text', 'cluster'])

# Concatenate every text that shares a cluster label into one space-separated
# string; the sample data yields two clusters, '1' and '4'.
clusters = df.groupby('cluster')
group_by_text = clusters.agg({'text': ' '.join})


def _count_words(joined_text):
    """Return a Counter of the whitespace-separated words in joined_text."""
    return Counter(joined_text.split())


# One Counter per cluster, with the cluster label restored as a regular column.
per_cluster_counts = group_by_text['text'].apply(_count_words)
per_cluster_counts.reset_index()

JavaScript
 
import pandas as pd
from collections import Counter
​
df = pd.DataFrame([['white hanging heart t-light holder','1'],
                   ['white metal lantern','4'],
                   ['red woolly hottie white heart','1'],
                   ['knitted union flag hot water bottle','4']], columns  = ['text', 'cluster'])
​
# join text based on group , so in above case I have created two cluster , 1 and 4
group_by_text = df.groupby('cluster').agg({'text': ' '.join})
​
# Now you can count the word using counter 
​
group_by_text['text'].apply(lambda x: Counter(x.split())).reset_index()
​

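You will get the desired output. For the original `cluster9` series, which holds a NumPy array of strings per cluster, the equivalent is to join the array into one string first, e.g. `Counter(word_tokenize(' '.join(cluster9[0])))`.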

How to resolve TypeError: cannot use a string pattern on a bytes-like object – word_tokenize, Counter and spacy

Advertisement

Answer