Source code for twitterpersona.preprocessing

import pandas as pd
import preprocessor as p
import nltk
from nltk.corpus import stopwords


[docs]def generalPreprocessing(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Perform general preprocessing on df. Removes retweets/favourites and cleans URLs, Mentions, Numbers, and stop words.  
    
    Parameters
    ----------
    df : pd.DataFrame
        A dataframe storing all the raw data with text column.
    
    output_path : str
        the path that the newly generated csv should located at.
    
    Returns
    -------
    df : pd.DataFrame
        The processed tweet dataframe.
        
    Examples
    --------
    generalPreprocessing(df)
    '''

    # remove retweets/favourites
    rt_fav_pattern = r'\b(RT|FAV)\b'
    filter = df['text'].str.contains(rt_fav_pattern, regex=True)
    df = df[~filter]

    # remove URL, Mentions, and Numbers
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.NUMBER)
    df['text_clean'] = df['text'].apply(lambda x: p.clean(x))

    # remove rows withstop words
    nltk.download('stopwords')
    df['text_clean'] = df['text_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords.words('english')]))

    return df