Source code for twitterpersona.preprocessing

import pandas as pd
import preprocessor as p
import nltk
from nltk.corpus import stopwords


def generalPreprocessing(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Perform general preprocessing on df.

    Removes retweets/favourites and cleans URLs, Mentions, Numbers,
    and stop words.

    Parameters
    ----------
    df : pd.DataFrame
        A dataframe storing all the raw data with text column.
        The input dataframe itself is not modified.

    Returns
    -------
    df : pd.DataFrame
        The processed tweet dataframe: rows whose ``text`` contains the
        standalone token ``RT`` or ``FAV`` are dropped, and a new
        ``text_clean`` column holds the cleaned text.

    Examples
    --------
    generalPreprocessing(df)
    '''
    # remove retweets/favourites
    # A non-capturing group matches exactly the same rows as a capturing
    # one, but avoids the pandas "pattern has match groups" warning.
    rt_fav_pattern = r'\b(?:RT|FAV)\b'
    is_rt_or_fav = df['text'].str.contains(rt_fav_pattern, regex=True)
    # Explicit copy so that adding a column below operates on an
    # independent frame instead of a slice of the caller's dataframe.
    df = df[~is_rt_or_fav].copy()

    # remove URL, Mentions, and Numbers
    # NOTE: set_options configures the preprocessor module globally.
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.NUMBER)
    df['text_clean'] = df['text'].apply(p.clean)

    # remove stop words
    # Fetch the corpus only if it is not available locally, and build the
    # lookup set once rather than re-reading the word list for every word.
    try:
        english_stopwords = set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords')
        english_stopwords = set(stopwords.words('english'))

    # Comparison is case-sensitive, so capitalised stop words are kept.
    df['text_clean'] = df['text_clean'].apply(
        lambda text: ' '.join(
            word for word in text.split() if word not in english_stopwords
        )
    )

    return df