-
Notifications
You must be signed in to change notification settings - Fork 1
/
clean_data.py
92 lines (67 loc) · 2.83 KB
/
clean_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 28 16:32:54 2018
@author: Leonova
"""
import docx
import glob
import os
import pandas as pd
def create_corpus_df(roles_folder, titleA, identifierA, titleB, titleB_primary = False):
""" Create a dataframe from a folder containing job posting .docx files.
Parameters
----------
roles_folder : str
Location of folder where all the job .docx files are stored.
titleA : str
Short title for a subset of jobs in the folder.
identifierA : str
Part of the job title that is used to distinguish titleA jobs
(Ex. 'analyst' in Marketing Data Analyst
or 'cientist' in "Staff Data Scientist")
titleB : str
Short title for other subset of jobs (this is the tile that will
be given to the rest of the jobs that were not picked
up by the identifier)
titleB_primary : bool, optional
Default is False, meaning that titleA is our primary role
and will be 1 in the returned df
Returns
-------
A dataframe where each row has the job title (name of the file),
shortened/simplified title, full job description, and boolean classifier.
"""
# --------------- PART 1: Aggregate all files in the folder -------------#
# Change Directory to where the files are stored
os.chdir(roles_folder)
# All files have been stored as .docx
text_filenames = glob.glob('*.docx')
def getText(filename):
doc = docx.Document(filename)
fullText = []
for para in doc.paragraphs:
fullText.append(para.text)
return '\n'.join(fullText)
file_text = []
for filename in text_filenames:
file_text.append(getText(filename))
# --------------- PART 2: Clean up the job titles -----------------------#
# Convert the lists into a DataFrame
df = pd.DataFrame({'title':text_filenames, 'description':file_text})
# Clean up the title column
df['title'] = df['title'].str.replace(".docx", "")
df['lower_case_title'] = [element.lower() for element in text_filenames]
# Identify if the job contains the key identifier
idA = str.lower(identifierA)
df['is_primary_role'] = df.lower_case_title.str.contains(idA).astype(int)
# Instead of using a loop, use the replace method
df['short_title'] = df['is_primary_role'].replace(1, titleA)
# Use the short_title column to replace the remaining 0s
df['short_title'] = df['short_title'].replace(0, titleB)
# Examine how many of each short_title there is
df['short_title'].value_counts()
# Should the primary role actually be for the second role?
# The one that didn't have a unique identifier
if titleB_primary:
df['is_primary_role'] = 1 - df['is_primary_role']
return df