Processing The Test Dataset
Processing The Test Dataset¶
In this section, we will process our test dataset using the pipeline we built in the previous section.
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize,sent_tokenize
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import re
from sklearn.preprocessing import OneHotEncoder
import joblib
# Fetch the NLTK resources at notebook start-up.
# 'stopwords' backs stopwords.words('english') and 'punkt' backs word_tokenize,
# both used by text_processing below. The tagger / wordnet / omw-1.4 resources
# are not referenced by any code visible in this notebook.
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[1], line 1
----> 1 import pandas as pd
2 import numpy as np
3 import nltk
ModuleNotFoundError: No module named 'pandas'
def text_processing(df, condition = "train"):
    """Clean the free-text columns of *df* and return their TF-IDF features.

    The columns ``description``, ``requirements``, ``title`` and ``benefits``
    are lower-cased, missing values are replaced by a single space, and each
    value is cleaned (URL placeholders, HTML tags, emoji, non-ASCII characters,
    digits, punctuation and English stop words removed) and Porter-stemmed.
    The cleaned text is written back into *df*, so the caller's frame is
    modified in place.

    Args:
        df: Data frame containing the four text columns.
        condition: ``"test"`` loads the fitted vectorizers from
            ``./pickle/vec_<column>.pkl`` and only transforms; any other value
            fits new vectorizers on *df* and writes them to those files.

    Returns:
        Data frame with one column per vocabulary term, suffixed ``_desc``,
        ``_req``, ``_title`` or ``_benefits``. It carries a fresh 0..n-1
        index, not the index of *df*.
    """
    # Text column -> suffix appended to every vocabulary term of that column.
    # The insertion order is also the column-group order of the result.
    column_suffixes = {
        "description": "_desc",
        "requirements": "_req",
        "title": "_title",
        "benefits": "_benefits",
    }

    # Built once per call instead of once per row; a set gives O(1) lookups.
    stop = set(stopwords.words('english'))
    porter = PorterStemmer()
    url_pattern = re.compile(r'#url_\w*#')
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    html_pattern = re.compile(r'<.*?>')
    dirty_pattern = re.compile(r'[^\x00-\x7F]+|(&)|\d|[^\w\s]')

    def clean(text):
        """Run the full cleaning and stemming pipeline on one cell value."""
        text = url_pattern.sub(r'url ', str(text))
        text = html_pattern.sub(r'html ', text)
        text = emoji_pattern.sub(r' ', text)
        text = dirty_pattern.sub(r' ', text)
        text = ' '.join(word.strip(string.punctuation) for word in text.split())
        text = ' '.join(word for word in text.split() if word not in stop)
        # Every stem is followed by a space, including the last one.
        return "".join(porter.stem(word) + " " for word in word_tokenize(text))

    for column in column_suffixes:
        # Assign the result back explicitly: Series.fillna(inplace=True) on
        # df[column] is chained assignment and does nothing under pandas
        # Copy-on-Write, which would let NaN through as the token "nan".
        df[column] = df[column].str.lower().fillna(" ").apply(clean)

    # Feature Extraction
    is_test = condition == "test"
    vectorizers = {}
    frames = []
    for column, suffix in column_suffixes.items():
        if is_test:
            # NOTE: pickle.load executes arbitrary code; only load files that
            # were produced by the training run of this pipeline.
            with open(f'./pickle/vec_{column}.pkl', 'rb') as f:
                vectorizer = pickle.load(f)
            tfidf = vectorizer.transform(df[column])
        else:
            vectorizer = TfidfVectorizer(smooth_idf=True)
            tfidf = vectorizer.fit_transform(df[column])
        vectorizers[column] = vectorizer
        feature_names = vectorizer.get_feature_names_out() + suffix
        frames.append(pd.DataFrame(data = tfidf.toarray(), columns = feature_names))

    feature_text = pd.concat(frames, axis=1)

    if not is_test:
        # Persist only after every column was fitted successfully.
        for column, vectorizer in vectorizers.items():
            with open(f'./pickle/vec_{column}.pkl', 'wb') as f:
                pickle.dump(vectorizer, f)

    return feature_text
def location_processing(df):
    """Derive a ``state`` column from the free-text ``location`` column.

    Missing locations are replaced by ``"No Location"`` in *df* itself, then
    each location is classified:

    * contains ``"US"`` and a two-capital-letter code -> that code
      (the first match wins when a city name also matches);
    * contains ``"US"`` but no such code -> ``"Domestic"``;
    * no ``"US"`` but some two-capital-letter code -> ``"Foreign"``;
    * anything else, including missing values -> ``"No Location"``.

    The extraction only works when the state is formatted as two capital
    letters.

    Args:
        df: Data frame with a ``location`` column of strings / missing values.

    Returns:
        The same data frame, with ``location`` filled and ``state`` added.
    """
    state_code = re.compile(r'[A-Z]{2}')

    def extract_state(location):
        """Classify a single, already filled, location string."""
        if "US" in location:
            # Remove the literal country code "US". The previous character
            # class [US] deleted every "U" and every "S", which destroyed
            # state codes such as SC, SD, UT or MS.
            candidates = state_code.findall(location.replace("US", ""))
            # Edge case 1: US posting without a state -> "Domestic".
            # Edge case 2: a city name also matched -> keep the first match.
            return candidates[0] if candidates else "Domestic"
        # Edge case 3: location is not given.
        if location == "No Location":
            return "No Location"
        # Edge case 4: location is given but not in the US.
        if state_code.search(location):
            return "Foreign"
        # Edge case 5: location cannot be identified.
        return "No Location"

    locations = df["location"].fillna("No Location")
    df["location"] = locations
    # Iterate over the values rather than s[i], so that a non-default index
    # (e.g. after filtering rows) cannot raise KeyError or pick wrong rows.
    df["state"] = [extract_state(location) for location in locations]
    return df
def OHE_processing(df, condition = "train"):
    """One-hot encode the categorical columns of *df*.

    Missing values in ``function``, ``employment_type``,
    ``required_experience``, ``industry`` and ``state`` are replaced by the
    string ``"NAN"`` in *df* itself, then every column is encoded with its own
    ``OneHotEncoder``.

    Args:
        df: Data frame containing the five categorical columns.
        condition: ``"test"`` loads the fitted encoders from
            ``./pickle/encoder_<name>.pkl`` and only transforms; any other
            value fits new encoders (``handle_unknown='ignore'``) and writes
            them to those files.

    Returns:
        Data frame of dummy variables for ``function``, ``employment_type``,
        ``required_experience`` and ``industry`` (in that order) with a fresh
        0..n-1 index.
    """
    # Column -> stem of its pickle file name.
    pickle_stems = {
        "function": "func",
        "employment_type": "et",
        "required_experience": "re",
        "industry": "ind",
        "state": "state",
    }
    # NOTE(review): the state encoder is fitted, applied and persisted, but
    # its dummy columns have never been part of the returned frame. That is
    # kept as is so the feature matrix stays unchanged; confirm whether state
    # was meant to be included.
    returned_columns = ("function", "employment_type", "required_experience", "industry")

    is_test = condition == "test"
    encoders = {}
    frames = {}
    for column, stem in pickle_stems.items():
        # Assign back explicitly: Series.fillna(inplace=True) on df[column] is
        # chained assignment and does nothing under pandas Copy-on-Write.
        df[column] = df[column].fillna("NAN")
        if is_test:
            # NOTE: pickle.load executes arbitrary code; only load files that
            # were produced by the training run of this pipeline.
            with open(f'./pickle/encoder_{stem}.pkl', 'rb') as f:
                encoder = pickle.load(f)
            encoded = encoder.transform(df[[column]])
        else:
            encoder = OneHotEncoder(handle_unknown = 'ignore')
            encoded = encoder.fit_transform(df[[column]])
        encoders[stem] = encoder
        frames[column] = pd.DataFrame(encoded.toarray(), columns = encoder.get_feature_names_out())

    ohe_feature = pd.concat([frames[column] for column in returned_columns], axis=1)

    if not is_test:
        # Persist only after every column was fitted successfully.
        for stem, encoder in encoders.items():
            with open(f'./pickle/encoder_{stem}.pkl', 'wb') as f:
                pickle.dump(encoder, f)

    return ohe_feature
def final_processing(df):
    """Drop the columns handled elsewhere and binarize the remaining text columns.

    The identifier / raw columns, the four text columns and the five
    categorical columns are removed. ``company_profile`` and
    ``required_education`` are turned into missing-value indicators:
    1 when the original value is missing, 0 when it is present.

    Args:
        df: Data frame that still contains every column listed below.

    Returns:
        A new data frame; *df* itself is left unchanged.

    Raises:
        KeyError: if one of the columns to drop is not present.
    """
    dropped_columns = [
        # Remove job_id, department, salary_range and location
        'job_id', 'department', 'salary_range', 'location',
        # Raw text, already represented by the TF-IDF features
        'description', 'requirements', 'title', 'benefits',
        # Categorical columns, already represented by the one-hot features
        'function', 'employment_type', 'required_experience', 'industry', 'state',
    ]
    df = df.drop(dropped_columns, axis=1)

    # Assign whole columns instead of df[col][mask] = value: the chained form
    # raises SettingWithCopyWarning and has no effect under Copy-on-Write.
    for column in ("company_profile", "required_education"):
        df[column] = df[column].isnull().astype(int)
    return df
# Load the held-out test split; the first CSV column becomes the index.
test_data = pd.read_csv("./data/test_set.csv", index_col = 0)
# Drop the first remaining column by position.
# NOTE(review): presumably a leftover index column from an earlier export - confirm against the CSV.
test_data = test_data.iloc[:, 1:]
# condition="test" makes text_processing load the pickled TF-IDF vectorizers instead of fitting new ones.
# The call also overwrites the four text columns of test_data with their cleaned form.
text_features_test = text_processing(test_data, condition = "test")
# Must run before OHE_processing and final_processing: both expect the "state" column added here.
test_data = location_processing(test_data)
# condition="test" makes OHE_processing load the pickled one-hot encoders instead of fitting new ones.
OHE_features_test = OHE_processing(test_data, condition = "test")
# Drops the raw / text / categorical columns and binarizes company_profile and required_education.
processed_test = final_processing(test_data)
C:\Users\isaac\AppData\Local\Temp\ipykernel_16048\3958551616.py:15: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df["company_profile"][bool_series_cp] = 1
C:\Users\isaac\AppData\Local\Temp\ipykernel_16048\3958551616.py:16: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df["company_profile"][~bool_series_cp] = 0
C:\Users\isaac\AppData\Local\Temp\ipykernel_16048\3958551616.py:19: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df["required_education"][bool_series_re] = 1
C:\Users\isaac\AppData\Local\Temp\ipykernel_16048\3958551616.py:20: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df["required_education"][~bool_series_re] = 0
# Join text, one-hot and remaining features column-wise; pd.concat(axis=1) aligns rows on the index.
# NOTE(review): the two feature frames carry a default 0..n-1 index while processed_test keeps
# the CSV index - this assumes the CSV index is also 0..n-1; confirm.
processed_test = pd.concat([text_features_test, OHE_features_test, processed_test], axis = 1)
We keep only the columns that were chosen during feature selection on the training data.
# Feature matrix saved by the training pipeline; only its column labels are used here.
train_features = pd.read_csv("./data/final_feature_matrix.csv", index_col = 0)
# Select the test columns by the training column labels, in the training order.
final_feature_matrix_test = processed_test.loc[:, train_features.columns]
# Bare expression: displays the frame as the notebook cell output.
final_feature_matrix_test
administr_desc | answer_desc | assist_desc | bill_desc | call_desc | cash_desc | desir_desc | duti_desc | earn_desc | entri_desc | ... | industry_Accounting | industry_Leisure, Travel & Tourism | industry_NAN | industry_Oil & Energy | company_profile | telecommuting | has_company_logo | has_questions | required_education | fraudulent | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000000 | 0.000000 | 0.044915 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 0 | 1 | 0 | 1 | 0 |
1 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.054131 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 1 | 1 | 0 | 0 |
2 | 0.211551 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.057114 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 1 | 0 | 0 | 0 |
3 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 1 | 1 | 1 | 0 |
4 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 1 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3571 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 1 | 0 | 0 | 0 | 1 | 0 |
3572 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0 | 0 | 1 | 0 | 1 | 0 |
3573 | 0.000000 | 0.142424 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.072183 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 1 | 1 | 1 | 0 |
3574 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0 | 0 | 1 | 1 | 1 | 0 |
3575 | 0.000000 | 0.000000 | 0.035608 | 0.0 | 0.000000 | 0.0 | 0.053655 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0 | 0 | 1 | 1 | 1 | 0 |
3576 rows × 86 columns
# Persist the selected test features (the index is written as the first CSV column).
final_feature_matrix_test.to_csv("./data/final_feature_matrix_test.csv")