Creating Pipeline
In this section, we combine everything discussed in “Feature Creation” and “Feature Selection” into a complete preprocessing pipeline. This is especially useful when it comes to processing the test set, because the same transformations must be reapplied with the parameters learned from the training set.
import pandas as pd
import numpy as np
import re
import string
import pickle
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
nltk.download('stopwords')
nltk.download('punkt')
def text_processing(df, condition="train"):
    """Extract text features from the text columns.

    Input:  data frame
    Output: data frame of TF-IDF features extracted from the text columns
    """
    # Lowercase the text columns and fill missing values with a blank placeholder
    for col in ["description", "requirements", "benefits", "title"]:
        df[col] = df[col].str.lower()
        df[col].fillna(" ", inplace=True)
    # Remove unnecessary words and punctuation, then stem
    stop = set(stopwords.words('english'))
    def remove_URL(text):
        url = re.compile(r'#url_\w*#')
        return url.sub(r'url ', str(text))
    def remove_emoji(text):
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r' ', str(text))
    def remove_html(text):
        html = re.compile(r'<.*?>')
        return html.sub(r'html ', str(text))
    def remove_stopwords(words):
        return ' '.join(word for word in str(words).split() if word not in stop)
    def remove_punctuation(words):
        return ' '.join(word.strip(string.punctuation) for word in str(words).split())
    def remove_dirty_words(words):
        dirty_words = re.compile(r'[^\x00-\x7F]+|(&)|\d|[^\w\s]')
        return dirty_words.sub(r' ', str(words))
    def stemSentence(sentence):
        porter = PorterStemmer()
        stem_sentence = []
        for word in word_tokenize(sentence):
            stem_sentence.append(porter.stem(word))
            stem_sentence.append(" ")
        return "".join(stem_sentence)
df["description"] = df.description.apply(remove_URL)
df["description"] = df.description.apply(remove_html)
df["description"] = df.description.apply(remove_emoji)
df["description"] = df.description.apply(remove_dirty_words)
df["description"] = df.description.apply(remove_punctuation)
df["description"] = df.description.apply(remove_stopwords)
df["description"] = df.description.apply(stemSentence)
df["title"] = df.title.apply(remove_URL)
df["title"] = df.title.apply(remove_html)
df["title"] = df.title.apply(remove_emoji)
df["title"] = df.title.apply(remove_dirty_words)
df["title"] = df.title.apply(remove_punctuation)
df["title"] = df.title.apply(remove_stopwords)
df["title"] = df.title.apply(stemSentence)
df["benefits"] = df.benefits.apply(remove_URL)
df["benefits"] = df.benefits.apply(remove_html)
df["benefits"] = df.benefits.apply(remove_emoji)
df["benefits"] = df.benefits.apply(remove_dirty_words)
df["benefits"] = df.benefits.apply(remove_punctuation)
df["benefits"] = df.benefits.apply(remove_stopwords)
df["benefits"] = df.benefits.apply(stemSentence)
df["requirements"] = df.requirements.apply(remove_URL)
df["requirements"] = df.requirements.apply(remove_html)
df["requirements"] = df.requirements.apply(remove_emoji)
df["requirements"] = df.requirements.apply(remove_dirty_words)
df["requirements"] = df.requirements.apply(remove_punctuation)
df["requirements"] = df.requirements.apply(remove_stopwords)
df["requirements"] = df.requirements.apply(stemSentence)
    # Feature extraction: one TF-IDF vectorizer per text column.
    # Training: fit new vectorizers and pickle them for later reuse.
    # Test: load the fitted vectorizers so the feature space matches the training set.
    suffixes = {"description": "_desc", "requirements": "_req",
                "title": "_title", "benefits": "_benefits"}
    frames = []
    for col, suffix in suffixes.items():
        if condition == "test":
            with open(f'./pickle/vec_{col}.pkl', 'rb') as f:
                vec = pickle.load(f)
            tfidf = vec.transform(df[col])
        else:
            vec = TfidfVectorizer(smooth_idf=True)
            tfidf = vec.fit_transform(df[col])
            with open(f'./pickle/vec_{col}.pkl', 'wb') as f:
                pickle.dump(vec, f)
        feature_names = vec.get_feature_names_out() + suffix
        frames.append(pd.DataFrame(data=tfidf.toarray(), columns=feature_names))
    feature_text = pd.concat(frames, axis=1)
    return feature_text
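When the test set is processed later, the same function is called with condition="test" so that the vectorizers fitted on the training set are reused. A minimal sketch, assuming a ./data/test_set.csv file with the same columns as the training set and that a training run has already written the pickle files:

test_data = pd.read_csv("./data/test_set.csv")  # assumed path; adjust to your test file
text_features_test = text_processing(test_data, condition="test")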
def location_processing(df):
    def extract_state(s):
        """Extract the US state from the location column.

        This only works when the state is formatted as two capital letters.
        Input:  Series (iterable)
        Output: list of states
        """
        s.fillna("No Location", inplace=True)
        result = []
        for i in np.arange(len(s)):
            if "US" in s[i]:
                # Strip the characters of the "US" country code, then look for a
                # two-capital-letter state code
                extracted = re.findall(r'[A-Z]{2}', re.sub(r'[US]', '', s[i]))
                # Edge Case 1: posting is from the US but no state is given
                if extracted == []:
                    extracted = ["Domestic"]
                # Edge Case 2: the regex also matched a city name; keep the first match only
                extracted = extracted[:1]
                result += extracted
            else:
                # Edge Case 3: no location is given
                if s[i] == "No Location":
                    result.append("No Location")
                # Edge Case 4: a location is given but it is outside the US
                elif re.findall(r'[A-Z]{2}', s[i]) != []:
                    result.append("Foreign")
                # Edge Case 5: the location cannot be identified from the given information
                else:
                    result.append("No Location")
        return result
    df["state"] = extract_state(df["location"])
    return df
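As a quick sanity check, here is how the function behaves on a few made-up locations (illustrative values, not rows from the dataset):

toy = pd.DataFrame({"location": ["US, NY, New York", "US, ,", "GB, LND, London", None]})
toy = location_processing(toy)
print(toy["state"].tolist())  # ['NY', 'Domestic', 'Foreign', 'No Location']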
def OHE_processing(df, condition="train"):
    """One-hot encode all categorical variables.

    Input:  data frame
    Output: data frame of dummy variables
    """
    # Map each categorical column to the short name used in the pickle files
    cat_cols = {"function": "func", "employment_type": "et",
                "required_experience": "re", "industry": "ind", "state": "state"}
    for col in cat_cols:
        df[col].fillna("NAN", inplace=True)
    # Training: fit new encoders and pickle them; test: reuse the fitted encoders
    frames = []
    for col, short in cat_cols.items():
        if condition == "test":
            with open(f'./pickle/encoder_{short}.pkl', 'rb') as f:
                encoder = pickle.load(f)
            encoded = encoder.transform(df[[col]])
        else:
            encoder = OneHotEncoder(handle_unknown='ignore')
            encoded = encoder.fit_transform(df[[col]])
            with open(f'./pickle/encoder_{short}.pkl', 'wb') as f:
                pickle.dump(encoder, f)
        frames.append(pd.DataFrame(encoded.toarray(),
                                   columns=encoder.get_feature_names_out()))
    ohe_feature = pd.concat(frames, axis=1)
    return ohe_feature
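On the test set the fitted encoders are loaded from disk, and because they were created with handle_unknown='ignore', any category unseen during training is encoded as an all-zero row instead of raising an error. Continuing the sketch above:

test_data = location_processing(test_data)  # the state column must exist before encoding
OHE_features_test = OHE_processing(test_data, condition="test")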
def final_processing(df):
    """Delete unused columns and binarize company_profile / required_education.

    Input:  data frame
    Output: processed data frame
    """
    # Remove identifiers and columns already handled elsewhere in the pipeline
    df = df.drop(['job_id', 'department', 'salary_range', 'location'], axis=1)
    df = df.drop(['description', 'requirements', 'title', 'benefits'], axis=1)
    df = df.drop(['function', 'employment_type', 'required_experience', 'industry', 'state'], axis=1)
    # Binarize company_profile and required_education: 1 if missing, 0 otherwise
    df["company_profile"] = pd.isnull(df["company_profile"]).astype(int)
    df["required_education"] = pd.isnull(df["required_education"]).astype(int)
    return df
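The binarization simply flags whether a field was provided at all (1 = missing, 0 = present). For example:

toy = pd.Series(["We are a tech startup.", None, "Global retailer."])
print(pd.isnull(toy).astype(int).tolist())  # [0, 1, 0]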
Please follow these steps in order when processing your data; otherwise, you may run into a MemoryError.
train_data = pd.read_csv("./data/train_set.csv")                     # load data
text_features_train = text_processing(train_data)                    # text processing (takes a long time)
joblib.dump(text_features_train, './data/text_features_train_jlib')  # save as a joblib file
train_data = location_processing(train_data)                         # add the state column to the train data
OHE_features_train = OHE_processing(train_data)                      # one-hot encoding
joblib.dump(OHE_features_train, './data/OHE_features_train_jlib')    # save the OHE features as a joblib file
processed_train = final_processing(train_data)                       # final processing
joblib.dump(processed_train, './data/processed_train_jlib')          # save the processed train_data as a joblib file
Combine these three files to obtain the full feature matrix.
Here is how you unpack the joblib file:
text_features_train = joblib.load('./data/text_features_train_jlib')
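From there, one way to assemble the full training matrix is a column-wise concat (a sketch; reset_index guards against mismatched indices and assumes the three frames share the same row order):

OHE_features_train = joblib.load('./data/OHE_features_train_jlib')
processed_train = joblib.load('./data/processed_train_jlib')
train_matrix = pd.concat([text_features_train.reset_index(drop=True),
                          OHE_features_train.reset_index(drop=True),
                          processed_train.reset_index(drop=True)], axis=1)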
Warning

Saving these files as CSV or pickle can cause a MemoryError because of their large size. For storing array-like objects, joblib works better than pickle; pickle remains useful for non-array objects such as encoders and models.