Source code for murphy.nlp_tools

from typing import Union

import en_core_web_sm
import nltk
import pandas as pd
from dask.dataframe import DataFrame as dask_dataframe

try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    # Fetch the NLTK resources needed for tokenization and stopword removal.
    nltk.download('punkt')
    nltk.download('stopwords')

from nltk.corpus import stopwords


class NLPTools:
    # Shared spaCy model, loaded once as a class attribute so the static
    # _lemmatize helper can reach it via NLPTools.nlp.
    nlp = en_core_web_sm.load()

    def __init__(self, tokenize: bool = True, filter_stopwords: bool = True,
                 lemmatize: bool = True, language: str = 'english'):
        self.tokenize_flag = tokenize
        self.filter_stopwords_flag = filter_stopwords
        self.lemmatize_flag = lemmatize
        self.language = language
        self.stopwords = stopwords.words(self.language)

    def _tokenize(self, string: str) -> str:
        # Tokenize with NLTK and drop punctuation-only tokens.
        tokens = nltk.word_tokenize(text=string, language=self.language)
        tokens_lst = [word for word in tokens if word.isalnum()]
        return ' '.join(tokens_lst)

    def _remove_stopwords(self, string: str) -> str:
        filtered = filter(lambda word: word not in self.stopwords, string.split(' '))
        return ' '.join(filtered)

    @staticmethod
    def _lemmatize(string: str) -> str:
        # Replace each token with its spaCy lemma.
        doc = NLPTools.nlp(string)
        lemmatized = [token.lemma_ for token in doc]
        return ' '.join(lemmatized)
    def tokenize_tweets(self, tweet_dataframe: Union[dask_dataframe, pd.DataFrame]
                        ) -> Union[dask_dataframe, pd.DataFrame]:
        # 'meta' declares the output dtype for dask; pandas apply does not
        # accept it, so it is only passed for dask DataFrames.
        kwargs = {'meta': str} if isinstance(tweet_dataframe, dask_dataframe) else {}
        tweet_dataframe['text'] = tweet_dataframe['text'].apply(self._tokenize, **kwargs)
        return tweet_dataframe
    def filter_stopwords(self, tweet_dataframe: Union[dask_dataframe, pd.DataFrame]
                         ) -> Union[dask_dataframe, pd.DataFrame]:
        kwargs = {'meta': str} if isinstance(tweet_dataframe, dask_dataframe) else {}
        tweet_dataframe['text'] = tweet_dataframe['text'].apply(self._remove_stopwords, **kwargs)
        return tweet_dataframe
    def lemmatize_tweets(self, tweet_dataframe: Union[dask_dataframe, pd.DataFrame]
                         ) -> Union[dask_dataframe, pd.DataFrame]:
        kwargs = {'meta': str} if isinstance(tweet_dataframe, dask_dataframe) else {}
        tweet_dataframe['text'] = tweet_dataframe['text'].apply(self._lemmatize, **kwargs)
        return tweet_dataframe
    def run_tools(self, tweet_dataframe: Union[dask_dataframe, pd.DataFrame]
                  ) -> Union[dask_dataframe, pd.DataFrame]:
        # Apply the enabled steps in order: tokenize -> stopword removal -> lemmatize.
        if self.tokenize_flag:
            tweet_dataframe = self.tokenize_tweets(tweet_dataframe)
        if self.filter_stopwords_flag:
            tweet_dataframe = self.filter_stopwords(tweet_dataframe)
        if self.lemmatize_flag:
            tweet_dataframe = self.lemmatize_tweets(tweet_dataframe)
        return tweet_dataframe
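
A minimal usage sketch, not part of the module itself: it assumes a hypothetical tweets.csv file with a 'text' column, which is the only column NLPTools touches.

if __name__ == '__main__':
    import dask.dataframe as dd

    # Hypothetical input: a CSV of tweets with a 'text' column.
    tweets = dd.read_csv('tweets.csv')
    tools = NLPTools(tokenize=True, filter_stopwords=True, lemmatize=True)
    cleaned = tools.run_tools(tweets)
    # .head() triggers computation on the first partition only.
    print(cleaned['text'].head())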