Processing The Test Dataset

Processing The Test Dataset

In this section, we process the test dataset with the same pipeline that we built in the previous section.

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize,sent_tokenize
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import re
from sklearn.preprocessing import OneHotEncoder
import joblib

# Fetch the NLTK resources in the same order as before.
for _resource in (
    'stopwords',
    'averaged_perceptron_tagger',
    'wordnet',
    'omw-1.4',
    'punkt',
):
    nltk.download(_resource)
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 1
----> 1 import pandas as pd
      2 import numpy as np
      3 import nltk

ModuleNotFoundError: No module named 'pandas'
def text_processing(df, condition = "train"):
    """Clean the free-text columns of ``df`` and return their TF-IDF features.

    The ``description``, ``requirements``, ``benefits`` and ``title`` columns
    are lower-cased, missing values are replaced by a single blank, and every
    text is cleaned (``#url_...#`` placeholders, HTML tags, emoji, non-ASCII
    characters, digits, punctuation, English stop words) and Porter-stemmed.
    The cleaned text is written back into ``df``, i.e. the input frame is
    modified.

    Args:
        df: Data frame holding the four text columns listed above.
        condition: ``"test"`` loads the already fitted vectorizers from
            ``./pickle/vec_<column>.pkl`` and only transforms the text. Any
            other value fits new ``TfidfVectorizer`` objects on ``df`` and
            pickles them to those same paths.

    Returns:
        Data frame with one TF-IDF column per vocabulary term. Column names
        carry the suffix ``_desc``, ``_req``, ``_title`` or ``_benefits`` and
        the four groups are concatenated in that order. The frame is built
        without an explicit index, so it does not reuse the index of ``df``.
    """
    text_columns = ("description", "requirements", "benefits", "title")

    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: the chained in-place form is a no-op under pandas
    # Copy-on-Write and would let NaN reach the cleaners as the text "nan".
    for column in text_columns:
        df[column] = df[column].str.lower().fillna(" ")

    # A set gives constant-time stop-word lookups.
    stop = set(stopwords.words('english'))

    # Patterns and the stemmer are loop-invariant, so build them once.
    url_pattern = re.compile(r'#url_\w*#')
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    html_pattern = re.compile(r'<.*?>')
    dirty_pattern = re.compile(r'[^\x00-\x7F]+|(&amp)|\d|[^\w\s]')
    porter = PorterStemmer()

    def remove_URL(text):
        # The URL placeholder is replaced by the token "url", not dropped.
        return url_pattern.sub(r'url ', str(text))

    def remove_html(text):
        # Each HTML tag is replaced by the token "html".
        return html_pattern.sub(r'html ', str(text))

    def remove_emoji(text):
        return emoji_pattern.sub(r' ', str(text))

    def remove_dirty_words(words):
        # Blanks out non-ASCII runs, "&amp", digits and remaining punctuation.
        return dirty_pattern.sub(r' ', str(words))

    def remove_punctuation(words):
        return ' '.join(word.strip(string.punctuation) for word in str(words).split())

    def remove_stopwords(words):
        return ' '.join(word for word in str(words).split() if word not in stop)

    def stemSentence(sentence):
        # Every stem is followed by one blank, including the last one.
        return "".join(porter.stem(word) + " " for word in word_tokenize(sentence))

    cleaning_steps = (
        remove_URL,
        remove_html,
        remove_emoji,
        remove_dirty_words,
        remove_punctuation,
        remove_stopwords,
        stemSentence,
    )

    def clean(text):
        for step in cleaning_steps:
            text = step(text)
        return text

    for column in text_columns:
        df[column] = df[column].apply(clean)

    # Feature extraction: (column, feature-name suffix, pickle path), listed
    # in the order in which the feature groups appear in the result.
    vectorizer_specs = (
        ("description", "_desc", './pickle/vec_description.pkl'),
        ("requirements", "_req", './pickle/vec_requirements.pkl'),
        ("title", "_title", './pickle/vec_title.pkl'),
        ("benefits", "_benefits", './pickle/vec_benefits.pkl'),
    )
    is_test = condition == "test"

    fitted = []
    feature_frames = []
    for column, suffix, path in vectorizer_specs:
        if is_test:
            # NOTE(review): pickle.load executes arbitrary code from the file;
            # only load vectorizers that this project wrote itself.
            with open(path, 'rb') as f:
                vectorizer = pickle.load(f)
            tfidf = vectorizer.transform(df[column])
        else:
            vectorizer = TfidfVectorizer(smooth_idf=True)
            tfidf = vectorizer.fit_transform(df[column])
            fitted.append((path, vectorizer))

        feature_names = vectorizer.get_feature_names_out() + suffix
        feature_frames.append(pd.DataFrame(data = tfidf.toarray(), columns = feature_names))

    feature_text = pd.concat(feature_frames, axis=1)

    # Persist the vectorizers only after every column was fitted successfully.
    for path, vectorizer in fitted:
        with open(path, 'wb') as f:
            pickle.dump(vectorizer, f)

    return feature_text
def location_processing(df):
    """Add a ``state`` column derived from the ``location`` column of ``df``.

    Missing locations are replaced by ``"No Location"`` in ``df`` itself and
    the extracted labels are stored in the new column ``state``.

    Args:
        df: Data frame with a ``location`` column of strings (or missing
            values). It is modified in place.

    Returns:
        The same data frame ``df`` with the additional ``state`` column.
    """
    def extract_state(s):
        """Map every location string to a state label.

        The state is only recognised when it is written as two capital
        letters.

        Args:
            s: Series (or other pandas object supporting ``fillna``) of
                location strings.

        Returns:
            List with one label per entry: the two-letter state for US
            postings, ``"Domestic"`` for US postings without a state,
            ``"Foreign"`` for non-US postings that contain a two-capital-letter
            code, and ``"No Location"`` otherwise.
        """
        result = []
        # Iterate over the values so that the index labels of `s` do not matter.
        for location in s.fillna("No Location"):
            if "US" in location:
                # Drop the literal country code "US" before searching. The
                # previous character class [US] deleted every 'U' and 'S' and
                # therefore destroyed states such as UT, SC or MS.
                candidates = re.findall(r'[A-Z]{2}', location.replace("US", ""))
                if candidates:
                    # Edge Case 2: further matches are usually parts of a city
                    # name, so only the first match is kept.
                    result.append(candidates[0])
                else:
                    # Edge Case 1: Posting is from US but State is not posted
                    result.append("Domestic")
            elif location == "No Location":
                # Edge Case 3: Location is not given
                result.append(location)
            elif re.findall(r'[A-Z]{2}', location):
                # Edge Case 4: Location is given but not in US
                result.append("Foreign")
            else:
                # Edge Case 5: Location cannot be identified from the given information
                result.append("No Location")
        return result

    # Assign instead of fillna(inplace=True) so the fill also reaches `df`
    # when pandas Copy-on-Write is active.
    df["location"] = df["location"].fillna("No Location")
    df["state"] = extract_state(df["location"])

    return df
def OHE_processing(df, condition = "train"):
    """One-hot encode the categorical columns of ``df``.

    Missing values in ``function``, ``employment_type``,
    ``required_experience``, ``industry`` and ``state`` are replaced by the
    string ``"NAN"`` (written back into ``df``) and each column is encoded
    with its own ``OneHotEncoder``.

    Args:
        df: Data frame containing the five categorical columns.
        condition: ``"test"`` loads the fitted encoders from
            ``./pickle/encoder_*.pkl`` and only transforms. Any other value
            fits new encoders (``handle_unknown='ignore'``) on ``df`` and
            pickles them to those paths.

    Returns:
        Data frame of dummy variables for ``function``, ``employment_type``,
        ``required_experience`` and ``industry``, concatenated in that order.
        It is built without an explicit index, so it does not reuse the index
        of ``df``.
    """
    # (column, pickle path) for every encoded column.
    encoder_specs = (
        ("function", './pickle/encoder_func.pkl'),
        ("employment_type", './pickle/encoder_et.pkl'),
        ("required_experience", './pickle/encoder_re.pkl'),
        ("industry", './pickle/encoder_ind.pkl'),
        ("state", './pickle/encoder_state.pkl'),
    )
    # NOTE(review): the state encoding is computed (and its encoder pickled)
    # but has never been part of the returned frame, in either branch. It is
    # kept out here to leave the output unchanged; confirm that is intended.
    returned_columns = ("function", "employment_type", "required_experience", "industry")
    is_test = condition == "test"

    encoders = {}
    if is_test:
        for column, path in encoder_specs:
            # NOTE(review): pickle.load executes arbitrary code from the file;
            # only load encoders that this project wrote itself.
            with open(path, 'rb') as f:
                encoders[column] = pickle.load(f)

    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: the chained in-place form is a no-op under pandas
    # Copy-on-Write and would pass NaN on to the encoders.
    for column, _ in encoder_specs:
        df[column] = df[column].fillna("NAN")

    encoded_frames = {}
    for column, _ in encoder_specs:
        if is_test:
            encoder = encoders[column]
            encoded = encoder.transform(df[[column]])
        else:
            encoder = OneHotEncoder(handle_unknown = 'ignore')
            encoded = encoder.fit_transform(df[[column]])
            encoders[column] = encoder

        encoded_frames[column] = pd.DataFrame(
            encoded.toarray(), columns = encoder.get_feature_names_out()
        )

    ohe_feature = pd.concat([encoded_frames[column] for column in returned_columns], axis=1)

    if not is_test:
        # Persist the encoders only after every column was fitted successfully.
        for column, path in encoder_specs:
            with open(path, 'wb') as f:
                pickle.dump(encoders[column], f)

    return ohe_feature
def final_processing(df):
    """Drop the columns that are no longer needed and binarize two columns.

    The identifier, raw-text and categorical columns are removed, and
    ``company_profile`` and ``required_education`` are turned into
    missing-value indicators.

    Args:
        df: Data frame that contains all columns dropped below as well as
            ``company_profile`` and ``required_education``. It is not
            modified; the work is done on the copy returned by ``drop``.

    Returns:
        New data frame without the dropped columns, in which
        ``company_profile`` and ``required_education`` are 1 where the
        original value was missing and 0 where it was present.
    """
    unused_columns = [
        # identifiers and raw columns that are not used as features
        'job_id', 'department', 'salary_range', 'location',
        # free-text columns
        'description', 'requirements', 'title', 'benefits',
        # categorical columns
        'function', 'employment_type', 'required_experience', 'industry', 'state',
    ]
    df = df.drop(unused_columns, axis=1)

    # Binarize with one whole-column assignment. The previous chained form
    # df[col][mask] = value raised SettingWithCopyWarning and does not write
    # through to `df` under pandas Copy-on-Write.
    for column in ("company_profile", "required_education"):
        df[column] = df[column].isna().astype(int)

    return df
# Load the held-out test set; the first CSV column becomes the index.
test_data = pd.read_csv("./data/test_set.csv", index_col = 0)
# Drop the first remaining column.
# NOTE(review): presumably a leftover index column from an earlier export — confirm.
test_data = test_data.iloc[:, 1:]
# condition="test" reuses the vectorizers pickled during training instead of refitting.
text_features_test = text_processing(test_data, condition = "test")
# Adds the "state" column, which OHE_processing fills and encodes below.
test_data = location_processing(test_data)
# condition="test" reuses the encoders pickled during training instead of refitting.
OHE_features_test = OHE_processing(test_data, condition = "test")
# Drops the raw text/categorical columns and binarizes the remaining ones.
processed_test = final_processing(test_data)
C:\Users\isaac\AppData\Local\Temp\ipykernel_16048\3958551616.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["company_profile"][bool_series_cp] = 1
C:\Users\isaac\AppData\Local\Temp\ipykernel_16048\3958551616.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["company_profile"][~bool_series_cp] = 0
C:\Users\isaac\AppData\Local\Temp\ipykernel_16048\3958551616.py:19: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["required_education"][bool_series_re] = 1
C:\Users\isaac\AppData\Local\Temp\ipykernel_16048\3958551616.py:20: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["required_education"][~bool_series_re] = 0
# Join the text, one-hot and remaining features column-wise (rows are aligned on the index).
# NOTE(review): the two feature frames are built without an explicit index while
# processed_test keeps the index read from the CSV; this assumes that index is 0..n-1 — verify.
processed_test = pd.concat([text_features_test, OHE_features_test, processed_test], axis = 1)

Next, we keep only the columns that were chosen during feature selection on the training set.

# The training feature matrix is loaded only for its column names.
train_features = pd.read_csv("./data/final_feature_matrix.csv", index_col = 0)
# Keep exactly the training columns, in the same order; .loc raises KeyError if one is missing.
final_feature_matrix_test = processed_test.loc[:, train_features.columns]
# Display the resulting frame as the cell output.
final_feature_matrix_test
administr_desc answer_desc assist_desc bill_desc call_desc cash_desc desir_desc duti_desc earn_desc entri_desc ... industry_Accounting industry_Leisure, Travel & Tourism industry_NAN industry_Oil & Energy company_profile telecommuting has_company_logo has_questions required_education fraudulent
0 0.000000 0.000000 0.044915 0.0 0.000000 0.0 0.000000 0.000000 0.0 0.0 ... 0.0 0.0 0.0 0.0 1 0 1 0 1 0
1 0.000000 0.000000 0.000000 0.0 0.054131 0.0 0.000000 0.000000 0.0 0.0 ... 0.0 0.0 0.0 0.0 0 0 1 1 0 0
2 0.211551 0.000000 0.000000 0.0 0.000000 0.0 0.000000 0.057114 0.0 0.0 ... 0.0 0.0 0.0 0.0 0 0 1 0 0 0
3 0.000000 0.000000 0.000000 0.0 0.000000 0.0 0.000000 0.000000 0.0 0.0 ... 0.0 0.0 0.0 0.0 0 0 1 1 1 0
4 0.000000 0.000000 0.000000 0.0 0.000000 0.0 0.000000 0.000000 0.0 0.0 ... 0.0 0.0 0.0 1.0 0 0 1 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3571 0.000000 0.000000 0.000000 0.0 0.000000 0.0 0.000000 0.000000 0.0 0.0 ... 0.0 0.0 1.0 0.0 1 0 0 0 1 0
3572 0.000000 0.000000 0.000000 0.0 0.000000 0.0 0.000000 0.000000 0.0 0.0 ... 0.0 0.0 1.0 0.0 0 0 1 0 1 0
3573 0.000000 0.142424 0.000000 0.0 0.000000 0.0 0.072183 0.000000 0.0 0.0 ... 0.0 0.0 0.0 0.0 0 0 1 1 1 0
3574 0.000000 0.000000 0.000000 0.0 0.000000 0.0 0.000000 0.000000 0.0 0.0 ... 0.0 0.0 1.0 0.0 0 0 1 1 1 0
3575 0.000000 0.000000 0.035608 0.0 0.000000 0.0 0.053655 0.000000 0.0 0.0 ... 0.0 0.0 1.0 0.0 0 0 1 1 1 0

3576 rows × 86 columns

# Write the test feature matrix (index included) to CSV.
final_feature_matrix_test.to_csv("./data/final_feature_matrix_test.csv")