Processing The Test Dataset

In this section, we will process our test dataset using the pipeline we built in the previous section.

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize,sent_tokenize
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import re
from sklearn.preprocessing import OneHotEncoder
import joblib

# Fetch the NLTK corpora/models this notebook downloads up front, in the
# same order as before.
for nltk_resource in (
    'stopwords',
    'averaged_perceptron_tagger',
    'wordnet',
    'omw-1.4',
    'punkt',
):
    nltk.download(nltk_resource)

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 1
----> 1 import pandas as pd
      2 import numpy as np
      3 import nltk

ModuleNotFoundError: No module named 'pandas'

def text_processing(df, condition = "train"):
    """Clean the free-text columns of ``df`` and return their TF-IDF features.

    The columns ``description``, ``requirements``, ``title`` and ``benefits``
    are lower-cased, missing values are replaced by a single space, and each
    value is run through the cleaning pipeline (URL/HTML placeholders, emoji
    and non-ASCII removal, punctuation stripping, stop-word removal, Porter
    stemming). The cleaned text is written back into ``df`` in place.

    Input:
        df: Data frame containing the four text columns.
        condition: ``"test"`` loads the fitted vectorizers from
            ``./pickle/vec_<column>.pkl`` and only transforms; any other
            value fits new vectorizers and writes them to those files.

    Output:
        Data frame of TF-IDF features (default RangeIndex), one column per
        vocabulary term, suffixed ``_desc``, ``_req``, ``_title`` or
        ``_benefits``, concatenated in that order.
    """

    # (column, feature-name suffix), in the order the blocks are concatenated.
    text_columns = (
        ("description", "_desc"),
        ("requirements", "_req"),
        ("title", "_title"),
        ("benefits", "_benefits"),
    )

    # A set gives O(1) membership checks; the stemmer and the compiled
    # patterns are built once instead of once per cell.
    stop = set(stopwords.words('english'))
    porter = PorterStemmer()

    url_pattern = re.compile(r'#url_\w*#')
    html_pattern = re.compile(r'<.*?>')
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    dirty_pattern = re.compile(r'[^\x00-\x7F]+|(&amp)|\d|[^\w\s]')

    def remove_URL(text):
        # URL placeholders are replaced by the token "url", not dropped.
        return url_pattern.sub(r'url ', str(text))

    def remove_html(text):
        # HTML tags are replaced by the token "html", not dropped.
        return html_pattern.sub(r'html ', str(text))

    def remove_emoji(text):
        return emoji_pattern.sub(r' ', str(text))

    def remove_dirty_words(words):
        # Non-ASCII runs, "&amp", digits and remaining punctuation -> space.
        return dirty_pattern.sub(r' ', str(words))

    def remove_punctuation(words):
        return ' '.join(word.strip(string.punctuation) for word in str(words).split())

    def remove_stopwords(words):
        return ' '.join(word for word in str(words).split() if word not in stop)

    def stemSentence(sentence):
        # Every stem is followed by a space (including the last one).
        return "".join(porter.stem(word) + " " for word in word_tokenize(sentence))

    cleaning_steps = (
        remove_URL,
        remove_html,
        remove_emoji,
        remove_dirty_words,
        remove_punctuation,
        remove_stopwords,
        stemSentence,
    )

    def clean(text):
        for step in cleaning_steps:
            text = step(text)
        return text

    for column, _ in text_columns:
        # Assign the result back instead of calling fillna(inplace=True) on
        # df[column]: the chained in-place form is deprecated and does not
        # update df under pandas Copy-on-Write.
        df[column] = df[column].str.lower().fillna(" ").apply(clean)

    # Feature Extraction

    is_test = condition == "test"
    feature_blocks = []
    for column, suffix in text_columns:
        pickle_path = f'./pickle/vec_{column}.pkl'
        if is_test:
            # NOTE: pickle.load executes arbitrary code from the file; only
            # load vectorizers written by the training run of this pipeline.
            with open(pickle_path, 'rb') as f:
                vectorizer = pickle.load(f)
            tfidf = vectorizer.transform(df[column])
        else:
            vectorizer = TfidfVectorizer(smooth_idf=True)
            tfidf = vectorizer.fit_transform(df[column])
            with open(pickle_path, 'wb') as f:
                pickle.dump(vectorizer, f)

        feature_names = vectorizer.get_feature_names_out() + suffix
        feature_blocks.append(pd.DataFrame(data = tfidf.toarray(), columns = feature_names))

    feature_text = pd.concat(feature_blocks, axis=1)

    return feature_text

def location_processing(df):
    """Add a ``state`` column derived from the ``location`` column of ``df``.

    Missing locations are replaced by ``"No Location"`` in ``df["location"]``
    and each location is mapped to one label:

    * a two-capital-letter state code, when the location contains ``"US"``
      and such a code follows;
    * ``"Domestic"`` when the location contains ``"US"`` but no state code;
    * ``"Foreign"`` when it does not contain ``"US"`` but contains a
      two-capital-letter token;
    * ``"No Location"`` otherwise.

    Input: Data frame with a ``location`` column of strings / missing values.
    Output: the same data frame, with ``location`` filled and ``state`` added.
    """
    def extract_state(s):
        """Extract one state label per entry of ``s``.

        The function can be used only when the state is formatted with two
        capital letters.
        Input: Series of location strings (missing values allowed).
        Output: List of labels, one per entry, in order.
        """
        result = []
        # Iterate over the values so that any index (not just 0..n-1) works,
        # and fill missing values on a copy instead of mutating the input.
        for location in s.fillna("No Location"):
            if "US" in location:
                # Drop the country code itself (the literal "US", not every
                # 'U' and 'S' character) so it is not mistaken for a state.
                candidates = re.findall(r'[A-Z]{2}', location.replace("US", "", 1))
                # Edge Case 1: Posting is from US but State is not posted
                # Edge Case 2: Regex detects a city name as a State name,
                #              so only the first match is kept
                result.append(candidates[0] if candidates else "Domestic")
            # Edge Case 3: Location is not given
            elif location == "No Location":
                result.append("No Location")
            # Edge Case 4: Location is given but not in US
            elif re.findall(r'[A-Z]{2}', location):
                result.append("Foreign")
            # Edge Case 5: Location cannot be identified from the given information
            else:
                result.append("No Location")
        return result

    # Explicit write-back keeps the filled location visible to the caller.
    df["location"] = df["location"].fillna("No Location")
    df["state"] = extract_state(df["location"])

    return df

def OHE_processing(df, condition = "train"):
    """One Hot Encoding of the categorical variables.

    Missing values in ``function``, ``employment_type``,
    ``required_experience``, ``industry`` and ``state`` are replaced by the
    string ``"NAN"`` (written back into ``df``), then each column is one-hot
    encoded with its own ``OneHotEncoder(handle_unknown='ignore')``.

    Input:
        df: Data frame containing the five columns above.
        condition: ``"test"`` loads the fitted encoders from
            ``./pickle/encoder_<tag>.pkl`` and only transforms; any other
            value fits new encoders and writes them to those files.

    Output:
        Data frame of dummy variables (default RangeIndex) for ``function``,
        ``employment_type``, ``required_experience`` and ``industry``, in
        that order.
    """
    # (column, pickle file tag) for the columns that make up the output.
    output_columns = (
        ("function", "func"),
        ("employment_type", "et"),
        ("required_experience", "re"),
        ("industry", "ind"),
    )
    # NOTE(review): the state encoder is fitted, pickled and applied like the
    # others, but its dummy columns have never been part of the returned
    # frame. That is preserved here; confirm whether the omission is intended.
    all_columns = output_columns + (("state", "state"),)

    is_test = condition == "test"

    for column, _ in all_columns:
        # Assign back instead of df[column].fillna(..., inplace=True): the
        # chained in-place form is deprecated and does not update df under
        # pandas Copy-on-Write.
        df[column] = df[column].fillna("NAN")

    encoded_frames = {}
    for column, tag in all_columns:
        pickle_path = f'./pickle/encoder_{tag}.pkl'
        if is_test:
            # NOTE: pickle.load executes arbitrary code from the file; only
            # load encoders written by the training run of this pipeline.
            with open(pickle_path, 'rb') as f:
                encoder = pickle.load(f)
            encoded = encoder.transform(df[[column]])
        else:
            encoder = OneHotEncoder(handle_unknown = 'ignore')
            encoded = encoder.fit_transform(df[[column]])
            with open(pickle_path, 'wb') as f:
                pickle.dump(encoder, f)

        encoded_frames[column] = pd.DataFrame(
            encoded.toarray(), columns = encoder.get_feature_names_out()
        )

    ohe_feature = pd.concat(
        [encoded_frames[column] for column, _ in output_columns], axis=1
    )

    return ohe_feature

def final_processing(df):
    """Drop the columns handled elsewhere and binarize the remaining text flags.

    Input: Data Frame containing every column listed in ``dropped`` below
        plus ``company_profile`` and ``required_education``.
    Output: a new Data Frame without the dropped columns, in which
        ``company_profile`` and ``required_education`` are 1 where the
        original value was missing and 0 where it was present. The input
        frame is not modified.
    """
    dropped = [
        # Identifiers / columns not used as features
        'job_id', 'department', 'salary_range', 'location',
        # Free-text columns (already turned into TF-IDF features)
        'description', 'requirements', 'title', 'benefits',
        # Categorical columns (already one-hot encoded)
        'function', 'employment_type', 'required_experience', 'industry', 'state',
    ]
    df = df.drop(columns=dropped)

    # binarize company_profile and required_education: 1 = missing, 0 = present.
    # A direct column assignment replaces the chained df[col][mask] = value
    # form, which raises SettingWithCopyWarning and does not write through
    # under pandas Copy-on-Write.
    for column in ("company_profile", "required_education"):
        df[column] = df[column].isnull().astype(int)

    return df

# Load the held-out test split; the first CSV column becomes the index and
# the first remaining column is then dropped by position.
# NOTE(review): presumably the dropped column is a leftover index written by
# an earlier to_csv - confirm against the file.
test_data = pd.read_csv("./data/test_set.csv", index_col = 0)
test_data = test_data.iloc[:, 1:]

# TF-IDF features from the vectorizers pickled during training
# (this also rewrites the text columns of test_data in place).
text_features_test = text_processing(test_data, condition = "test")

# Adds the "state" column that OHE_processing and final_processing expect.
test_data = location_processing(test_data)

# One-hot features from the encoders pickled during training.
OHE_features_test = OHE_processing(test_data, condition = "test")

# Drop the raw columns and binarize company_profile / required_education.
processed_test = final_processing(test_data)

C:\Users\isaac\AppData\Local\Temp\ipykernel_16048\3958551616.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["company_profile"][bool_series_cp] = 1
C:\Users\isaac\AppData\Local\Temp\ipykernel_16048\3958551616.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["company_profile"][~bool_series_cp] = 0
C:\Users\isaac\AppData\Local\Temp\ipykernel_16048\3958551616.py:19: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["required_education"][bool_series_re] = 1
C:\Users\isaac\AppData\Local\Temp\ipykernel_16048\3958551616.py:20: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["required_education"][~bool_series_re] = 0

# Join the three feature blocks column-wise. pd.concat(axis=1) aligns rows on
# the index; the text and OHE frames are built from arrays and so carry a
# default RangeIndex.
# NOTE(review): this assumes processed_test is also indexed 0..n-1 - confirm.
processed_test = pd.concat([text_features_test, OHE_features_test, processed_test], axis = 1)

We will keep only the columns that were selected during feature selection on the training set.

# The saved feature matrix is read only for its column names.
train_features = pd.read_csv("./data/final_feature_matrix.csv", index_col = 0)

# Keep exactly those columns, in the same order; .loc raises KeyError if any
# of them is absent from processed_test.
final_feature_matrix_test = processed_test.loc[:, train_features.columns]

# Display the result in the notebook.
final_feature_matrix_test

	administr_desc	answer_desc	assist_desc	bill_desc	call_desc	cash_desc	desir_desc	duti_desc	earn_desc	entri_desc	...	industry_Accounting	industry_Leisure, Travel & Tourism	industry_NAN	industry_Oil & Energy	company_profile	telecommuting	has_company_logo	has_questions	required_education	fraudulent
0	0.000000	0.000000	0.044915	0.0	0.000000	0.0	0.000000	0.000000	0.0	0.0	...	0.0	0.0	0.0	0.0	1	0	1	0	1	0
1	0.000000	0.000000	0.000000	0.0	0.054131	0.0	0.000000	0.000000	0.0	0.0	...	0.0	0.0	0.0	0.0	0	0	1	1	0	0
2	0.211551	0.000000	0.000000	0.0	0.000000	0.0	0.000000	0.057114	0.0	0.0	...	0.0	0.0	0.0	0.0	0	0	1	0	0	0
3	0.000000	0.000000	0.000000	0.0	0.000000	0.0	0.000000	0.000000	0.0	0.0	...	0.0	0.0	0.0	0.0	0	0	1	1	1	0
4	0.000000	0.000000	0.000000	0.0	0.000000	0.0	0.000000	0.000000	0.0	0.0	...	0.0	0.0	0.0	1.0	0	0	1	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
3571	0.000000	0.000000	0.000000	0.0	0.000000	0.0	0.000000	0.000000	0.0	0.0	...	0.0	0.0	1.0	0.0	1	0	0	0	1	0
3572	0.000000	0.000000	0.000000	0.0	0.000000	0.0	0.000000	0.000000	0.0	0.0	...	0.0	0.0	1.0	0.0	0	0	1	0	1	0
3573	0.000000	0.142424	0.000000	0.0	0.000000	0.0	0.072183	0.000000	0.0	0.0	...	0.0	0.0	0.0	0.0	0	0	1	1	1	0
3574	0.000000	0.000000	0.000000	0.0	0.000000	0.0	0.000000	0.000000	0.0	0.0	...	0.0	0.0	1.0	0.0	0	0	1	1	1	0
3575	0.000000	0.000000	0.035608	0.0	0.000000	0.0	0.053655	0.000000	0.0	0.0	...	0.0	0.0	1.0	0.0	0	0	1	1	1	0

3576 rows × 86 columns

# Persist the test feature matrix (the index is written as the first CSV column).
final_feature_matrix_test.to_csv("./data/final_feature_matrix_test.csv")

Classifying Fake Job Posting Using Machine Learning Algorithm

Processing The Test Dataset

Processing The Test Dataset¶