Creating Pipeline

In this section, we combine everything discussed in “Feature Creation” and “Feature Selection” into a complete preprocessing pipeline. This will be extremely useful when it comes time to process the test set.

import re
import string
import pickle

import joblib
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
def text_processing(df, condition="train"):
    """Extract text features from the free-text columns.

    Input:  data frame
    Output: data frame of the extracted text features with their TF-IDF values
    """
    
    df["description"] = df["description"].str.lower()
    df["requirements"] = df["requirements"].str.lower()
    df["benefits"] = df["benefits"].str.lower()
    df["title"] = df["title"].str.lower()
    
    # Remove all NA 
    df["description"].fillna(" ", inplace = True)
    df["requirements"].fillna(" ", inplace = True)
    df["benefits"].fillna(" ", inplace = True)
    df["title"].fillna(" ", inplace = True)
    
    # Remove unnecessary words and punctuation, then stem

    stop = set(stopwords.words('english'))

    def remove_URL(text):
        url = re.compile(r'#url_\w*#')
        return url.sub(r'url ',str(text))

    def remove_emoji(text):
        emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r' ', str(text))

    def remove_html(text):
        html=re.compile(r'<.*?>')
        return html.sub(r'html ',str(text))

    def remove_stopwords(words):
        return ' '.join(word for word in str(words).split() if word not in stop)

    def remove_punctuation(words):
        return ' '.join(word.strip(string.punctuation) for word in str(words).split())

    def remove_dirty_words(words):
        dirty_words=re.compile(r'[^\x00-\x7F]+|(&amp)|\d|[^\w\s]')
        return dirty_words.sub(r' ',str(words))
    
    def stemSentence(sentence):
        porter = PorterStemmer()
        return ' '.join(porter.stem(word) for word in word_tokenize(sentence))
    
    df["description"] = df.description.apply(remove_URL)
    df["description"] = df.description.apply(remove_html)
    df["description"] = df.description.apply(remove_emoji)
    df["description"] = df.description.apply(remove_dirty_words)
    df["description"] = df.description.apply(remove_punctuation)
    df["description"] = df.description.apply(remove_stopwords)
    df["description"] = df.description.apply(stemSentence)
    
    df["title"] = df.title.apply(remove_URL)
    df["title"] = df.title.apply(remove_html)
    df["title"] = df.title.apply(remove_emoji)
    df["title"] = df.title.apply(remove_dirty_words)
    df["title"] = df.title.apply(remove_punctuation)
    df["title"] = df.title.apply(remove_stopwords)
    df["title"] = df.title.apply(stemSentence)
    
    df["benefits"] = df.benefits.apply(remove_URL)
    df["benefits"] = df.benefits.apply(remove_html)
    df["benefits"] = df.benefits.apply(remove_emoji)
    df["benefits"] = df.benefits.apply(remove_dirty_words)
    df["benefits"] = df.benefits.apply(remove_punctuation)
    df["benefits"] = df.benefits.apply(remove_stopwords)
    df["benefits"] = df.benefits.apply(stemSentence)
    
    df["requirements"] = df.requirements.apply(remove_URL)
    df["requirements"] = df.requirements.apply(remove_html)
    df["requirements"] = df.requirements.apply(remove_emoji)
    df["requirements"] = df.requirements.apply(remove_dirty_words)
    df["requirements"] = df.requirements.apply(remove_punctuation)
    df["requirements"] = df.requirements.apply(remove_stopwords)
    df["requirements"] = df.requirements.apply(stemSentence)
    
    
    # Feature Extraction
    
    if condition == "test":
        with open('./pickle/vec_description.pkl', 'rb') as f:
            vec_description = pickle.load(f)
        
        with open('./pickle/vec_title.pkl', 'rb') as f:
            vec_title = pickle.load(f)
            
        with open('./pickle/vec_benefits.pkl', 'rb') as f:
            vec_benefits = pickle.load(f)
            
        with open('./pickle/vec_requirements.pkl', 'rb') as f:
            vec_requirements = pickle.load(f)
        
        tfidf_description = vec_description.transform(df["description"])
        features_name_desc = vec_description.get_feature_names_out() + "_desc"
        desc_features = pd.DataFrame(data = tfidf_description.toarray(), columns = features_name_desc)

        tfidf_requirements = vec_requirements.transform(df["requirements"])
        features_name_req = vec_requirements.get_feature_names_out() + "_req"
        req_features = pd.DataFrame(data = tfidf_requirements.toarray(), columns = features_name_req)

        tfidf_title = vec_title.transform(df["title"])
        features_name_title = vec_title.get_feature_names_out() + "_title"
        title_features = pd.DataFrame(data = tfidf_title.toarray(), columns = features_name_title)
        
        tfidf_benefits = vec_benefits.transform(df["benefits"])
        features_name_benefits = vec_benefits.get_feature_names_out() +"_benefits"
        benefits_features = pd.DataFrame(data = tfidf_benefits.toarray(), columns = features_name_benefits)
        
        feature_text = pd.concat([desc_features, req_features, title_features, benefits_features], axis=1)
        
    else:

        vec_description = TfidfVectorizer(smooth_idf=True)
        tfidf_description = vec_description.fit_transform(df["description"])
        features_name_desc = vec_description.get_feature_names_out() + "_desc"
        desc_features = pd.DataFrame(data = tfidf_description.toarray(), columns = features_name_desc)

        vec_requirements = TfidfVectorizer(smooth_idf=True)
        tfidf_requirements = vec_requirements.fit_transform(df["requirements"])
        features_name_req = vec_requirements.get_feature_names_out() + "_req"
        req_features = pd.DataFrame(data = tfidf_requirements.toarray(), columns = features_name_req)

        vec_title = TfidfVectorizer(smooth_idf=True)
        tfidf_title = vec_title.fit_transform(df["title"])
        features_name_title = vec_title.get_feature_names_out() + "_title"
        title_features = pd.DataFrame(data = tfidf_title.toarray(), columns = features_name_title)

        vec_benefits = TfidfVectorizer(smooth_idf=True)
        tfidf_benefits = vec_benefits.fit_transform(df["benefits"])
        features_name_benefits = vec_benefits.get_feature_names_out() + "_benefits"
        benefits_features = pd.DataFrame(data = tfidf_benefits.toarray(), columns = features_name_benefits)
        
        feature_text = pd.concat([desc_features, req_features, title_features, benefits_features], axis=1)
        
        with open('./pickle/vec_description.pkl', 'wb') as f:
            pickle.dump(vec_description, f)
        
        with open('./pickle/vec_title.pkl', 'wb') as f:
            pickle.dump(vec_title, f)
            
        with open('./pickle/vec_benefits.pkl', 'wb') as f:
            pickle.dump(vec_benefits, f)
            
        with open('./pickle/vec_requirements.pkl', 'wb') as f:
            pickle.dump(vec_requirements, f)
        
    return feature_text
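
One detail worth noting: get_feature_names_out() returns a NumPy array of strings, so appending a suffix such as + "_desc" broadcasts over every feature name. This is what keeps the column names of the four TF-IDF blocks from colliding. A tiny illustration with a made-up vocabulary:

names = np.array(["benefit", "salary"], dtype=object)
print(names + "_desc")  # ['benefit_desc' 'salary_desc']
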
def location_processing(df):
    def extract_state(s):
        """Extract the state from the location column.

        The function only works when the state is formatted as two capital letters.
        Input:  Series (iterable)
        Output: list of states
        """
        s = s.fillna("No Location")
        result = []
        for i in range(len(s)):
            if "US" in s[i]:
                # Strip the country code, then look for a two-capital-letter state code
                extracted = re.findall(r'[A-Z]{2}', re.sub(r'\bUS\b', '', s[i]))
                # Edge Case 1: posting is from the US but no state is given
                if extracted == []:
                    extracted = ["Domestic"]
                # Edge Case 2: the regex also matched part of a city name; keep the first match
                result += extracted[:1]
            else:
                # Edge Case 3: location is not given
                if s[i] == "No Location":
                    result.append("No Location")
                # Edge Case 4: location is given but is not in the US
                elif re.findall(r'[A-Z]{2}', s[i]) != []:
                    result.append("Foreign")
                # Edge Case 5: location cannot be identified from the given information
                else:
                    result.append("No Location")
        return result

    result = extract_state(df["location"])
    df["state"] = result
    
    return df 
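
As a quick illustration of the edge-case handling (toy locations, not taken from the dataset):

demo = pd.DataFrame({"location": ["US, TX, Dallas", "US, , Houston", "GB, LND, London", None]})
print(location_processing(demo)["state"].tolist())
# -> ['TX', 'Domestic', 'Foreign', 'No Location']
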
def OHE_processing(df, condition="train"):
    """One-hot encode all categorical variables.

    Input:  data frame
    Output: data frame of dummy variables
    """
    
    if condition == "test":
        with open('./pickle/encoder_func.pkl', 'rb') as f:
            encoder_func = pickle.load(f)
        
        with open('./pickle/encoder_et.pkl', 'rb') as f:
            encoder_et = pickle.load(f)
            
        with open('./pickle/encoder_re.pkl', 'rb') as f:
            encoder_re = pickle.load(f)
            
        with open('./pickle/encoder_ind.pkl', 'rb') as f:
            encoder_ind = pickle.load(f)
        
        with open('./pickle/encoder_state.pkl', 'rb') as f:
            encoder_state = pickle.load(f)
            
        
        df["function"].fillna("NAN", inplace = True)
        df["employment_type"].fillna("NAN", inplace = True)
        df["required_experience"].fillna("NAN", inplace = True)
        df["industry"].fillna("NAN", inplace = True)
        df["state"].fillna("NAN", inplace = True)

        encode_function = encoder_func.transform(df[['function']])
        feature_name_func = encoder_func.get_feature_names_out()
        encoder_df_func = pd.DataFrame(encode_function.toarray(), columns = feature_name_func)

        encode_et = encoder_et.transform(df[['employment_type']])
        feature_name_et = encoder_et.get_feature_names_out()
        encoder_df_et = pd.DataFrame(encode_et.toarray(), columns = feature_name_et)

        encode_re = encoder_re.transform(df[['required_experience']])
        feature_name_re = encoder_re.get_feature_names_out()
        encoder_df_re = pd.DataFrame(encode_re.toarray(), columns = feature_name_re)

        encode_ind = encoder_ind.transform(df[['industry']])
        feature_name_ind = encoder_ind.get_feature_names_out()
        encoder_df_ind = pd.DataFrame(encode_ind.toarray(), columns = feature_name_ind)
        
        encode_state = encoder_state.transform(df[['state']])
        feature_name_state = encoder_state.get_feature_names_out()
        encoder_df_state = pd.DataFrame(encode_state.toarray(), columns = feature_name_state)

        ohe_feature = pd.concat([encoder_df_func, encoder_df_et, encoder_df_re, encoder_df_ind, encoder_df_state], axis=1)
    
    else:   
        df["function"].fillna("NAN", inplace = True)
        df["employment_type"].fillna("NAN", inplace = True)
        df["required_experience"].fillna("NAN", inplace = True)
        df["industry"].fillna("NAN", inplace = True)
        df["state"].fillna("NAN", inplace = True)

        encoder_state = OneHotEncoder(handle_unknown = 'ignore')
        encode_state = encoder_state.fit_transform(df[['state']])
        feature_name_state = encoder_state.get_feature_names_out()
        encoder_df_state = pd.DataFrame(encode_state.toarray(), columns = feature_name_state)
        
        encoder_func = OneHotEncoder(handle_unknown = 'ignore')
        encode_function = encoder_func.fit_transform(df[['function']])
        feature_name_func = encoder_func.get_feature_names_out()
        encoder_df_func = pd.DataFrame(encode_function.toarray(), columns = feature_name_func)

        encoder_et = OneHotEncoder(handle_unknown = 'ignore')
        encode_et = encoder_et.fit_transform(df[['employment_type']])
        feature_name_et = encoder_et.get_feature_names_out()
        encoder_df_et = pd.DataFrame(encode_et.toarray(), columns = feature_name_et)

        encoder_re = OneHotEncoder(handle_unknown = 'ignore')
        encode_re = encoder_re.fit_transform(df[['required_experience']])
        feature_name_re = encoder_re.get_feature_names_out()
        encoder_df_re = pd.DataFrame(encode_re.toarray(), columns = feature_name_re)

        encoder_ind = OneHotEncoder(handle_unknown = 'ignore')
        encode_ind = encoder_ind.fit_transform(df[['industry']])
        feature_name_ind = encoder_ind.get_feature_names_out()
        encoder_df_ind = pd.DataFrame(encode_ind.toarray(), columns = feature_name_ind)

        ohe_feature = pd.concat([encoder_df_func, encoder_df_et, encoder_df_re, encoder_df_ind, encoder_df_state], axis=1)
        
        with open('./pickle/encoder_func.pkl', 'wb') as f:
            pickle.dump(encoder_func, f)

        with open('./pickle/encoder_et.pkl', 'wb') as f:
            pickle.dump(encoder_et, f)

        with open('./pickle/encoder_re.pkl', 'wb') as f:
            pickle.dump(encoder_re, f)

        with open('./pickle/encoder_ind.pkl', 'wb') as f:
            pickle.dump(encoder_ind, f)
            
        with open('./pickle/encoder_state.pkl', 'wb') as f:
            pickle.dump(encoder_state, f)
    
    return ohe_feature
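
A note on the design choice here: handle_unknown='ignore' means a category that appears only in the test set encodes as an all-zero row instead of raising an error. A toy illustration with made-up values:

enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(pd.DataFrame({"state": ["CA", "TX"]}))
# "NY" was never seen during fit, so it encodes as all zeros
print(enc.transform(pd.DataFrame({"state": ["NY"]})).toarray())  # [[0. 0.]]
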
def final_processing(df):
    """Drop unused columns and binarize the missingness of two fields.

    Input:  data frame
    Output: processed data frame
    """
        
    # Remove identifiers and the raw columns that have already been converted into features
    
    df = df.drop(['job_id', 'department', 'salary_range', 'location'], axis=1)
    df = df.drop(['description', 'requirements', 'title', 'benefits'], axis=1)
    df = df.drop(['function', 'employment_type', 'required_experience', 'industry', 'state'], axis=1)
    
    # Binarize company_profile and required_education:
    # 1 if the field is missing, 0 otherwise
    
    df["company_profile"] = df["company_profile"].isnull().astype(int)
    df["required_education"] = df["required_education"].isnull().astype(int)
    
    return df

Please follow these steps, in order, to process your data; otherwise you may run into a MemoryError.
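
Note that text_processing and OHE_processing write their fitted vectorizers and encoders to a ./pickle/ directory; if it does not already exist, create it (and the ./data/ output directory) first:

import os

# Create the output directories used by the pipeline, if missing
os.makedirs("./pickle", exist_ok=True)
os.makedirs("./data", exist_ok=True)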

train_data = pd.read_csv("./data/train_set.csv") # load data
text_features_train = text_processing(train_data) # text processing (takes a long time)
joblib.dump(text_features_train, './data/text_features_train_jlib') # save as a joblib file
train_data = location_processing(train_data) # add state to the train data
OHE_features_train = OHE_processing(train_data) # one-hot encoding
joblib.dump(OHE_features_train, './data/OHE_features_train_jlib') # save the OHE features as a joblib file
processed_train = final_processing(train_data) # final processing
joblib.dump(processed_train, './data/processed_train_jlib') # save the processed train_data as a joblib file

Concatenate these three objects column-wise to obtain the full feature matrix; an example follows the loading snippet below.

Here is how you unpack the joblib file:

text_features_train = joblib.load('./data/text_features_train_jlib')
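
For instance, the other two blocks can be reloaded the same way and stitched together (a sketch; X_train is an assumed name):

OHE_features_train = joblib.load('./data/OHE_features_train_jlib')
processed_train = joblib.load('./data/processed_train_jlib')

# Column-wise concatenation; reset the index so rows align positionally
X_train = pd.concat(
    [text_features_train, OHE_features_train, processed_train.reset_index(drop=True)],
    axis=1,
)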

Warning

Saving these files as CSV or pickle can cause a MemoryError because of their large size. For storing array-like objects, joblib works better than pickle; pickle is useful for non-array objects such as encoders and models.
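
The same pipeline can then be replayed on held-out data with condition="test", so that the vectorizers and encoders fitted on the training set are reused rather than refit. A sketch, assuming the test split lives at ./data/test_set.csv:

test_data = pd.read_csv("./data/test_set.csv") # hypothetical path for the test split
text_features_test = text_processing(test_data, condition="test") # reuses the pickled TF-IDF vectorizers
test_data = location_processing(test_data)
OHE_features_test = OHE_processing(test_data, condition="test") # reuses the pickled encoders
processed_test = final_processing(test_data)
X_test = pd.concat([text_features_test, OHE_features_test, processed_test.reset_index(drop=True)], axis=1)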