-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAdult_Literacy.py
194 lines (124 loc) · 6.61 KB
/
Adult_Literacy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 15 18:27:38 2020
@author: Deep Patel
Data Analysis of Adult Literacy (Various Data Queries)
"""
import pandas as pd
#import numpy as np
import os
from datetime import datetime
#from datetime import date
os.chdir("C:\\Users\\deepp\\Google Drive")
# Reading the excel file for each dataset
student = pd.read_excel('Student.xlsx')
tutor = pd.read_excel('Tutor.xlsx')
match_history = pd.read_excel('Match_History.xlsx')
# Question 1
# Filters Dataframe for Tutors with Dropped Status and specified date criteria
tutor_drop = tutor[(tutor['TutorStatus']=='Dropped') &
(tutor['CertDate']>'2018-04-01')]
print("Q1: Tutors IDs of Tutors who have a Dropped status and have achieved "
"their certification after 4/01/2018 are ", list(tutor_drop['TutorID']),
"\n"*2)
# Question 2
# Replacing NA/missing End Dates with current date
today= datetime.today().strftime('%Y-%m-%d')
match_history['EndDate'] = (match_history['EndDate'].fillna(today))
match_history['EndDate']=pd.to_datetime(match_history['EndDate'],
format= '%Y-%m-%d')
# Converting Start Date and End Date to datetime for arithmetic operations
match_history['EndDate']=pd.to_datetime(match_history['EndDate'])
match_history['StartDate']=pd.to_datetime(match_history['StartDate'])
# Creating new column to count number of days between start date and end date
match_history['Length']= (match_history['EndDate']
- match_history['StartDate']).dt.days
# Counts the average length of number of tutoting days
average_length= match_history['Length'].mean()
print ("Q2: The average length of time student stayed (or has stayed) in "\
"the program is: ", average_length, "\n"*2)
# Question 3
# Merges two dataframes by column name
match_tutor = pd.merge(match_history,tutor, on="TutorID")
# Filters Dataframe with Temp Stop Status of Tutor and specified date criteria
student_match= match_tutor[(match_tutor['TutorStatus']=='Temp Stop') &
(match_tutor['StartDate']>='2018-01-01')]
# Creating new dataframe to see relevant columns
student_temp= student_match[['StudentID','StartDate','EndDate',
'TutorID','TutorStatus']]
print ("Q3: The query for students who have been matched in 2018 with a "
"tutor whose status is Temp Stop. is as follows: \n",
student_temp, "\n")
print("The list of IDs of these students is ",
list(student_temp['StudentID']), "\n"*2)
# Question 4
# Mereges two dataframes by column name
score_join = pd.merge(match_tutor, student, on="StudentID")
# Filters Dataframe with Dropped status of Tutor
score_match= score_join[(score_join['TutorStatus']=='Dropped')]
# Creating new dataframe to see relevant columns
score_status_dropped = score_match[['ReadScore','StudentID','TutorStatus']]
print ("Q4: The query for read scores of students who were ever taught by "\
"tutors whose status is Dropped is as follows: \n",
score_status_dropped, "\n")
print("The list of Read Scores for these students is: ",
list(score_status_dropped['ReadScore']), "\n"*2)
# Question 5
# Filters dataframe with repeated TutorIDs in TutorID column
tutor_2students = match_history[match_history.duplicated(subset=['TutorID'],
keep=False)]
print("Q5: The query for tutors who taught two or more students is displayed "
"below: \n", tutor_2students, "\n")
print("The list of these tutors by thier tutor ID is: ",
list(set(list(tutor_2students['TutorID']))), "\n"*2)
# Question 6
# Creating new dataframe to see relevant columns for the specified query
student_tutor = score_join[['StudentID', 'ReadScore',
'TutorID', 'TutorStatus']]
print("Q6: List of all students, their read score, their tutors, and "
"tutors status is shown below and the file data is stored in "
".xlsx file: \n", student_tutor, "\n"*2)
# Writes the information the .xslx file
student_tutor.to_excel('Student_Tutor.xlsx', index=False)
# Question 7
# Creating new dataframe to see relevant columns for the specified query
group_filter= score_join[['StudentGroup','TutorID']]
# Tallies Tutors across different Student Groups
tutors_tally= pd.crosstab(group_filter['StudentGroup'],
group_filter['TutorID'], margins=True)
# Keeps only relevant columns and rows for the specified query
tally_filter = tutors_tally[['All']]
num_of_tutors= tally_filter[:-1]
number_of_tutors= num_of_tutors.rename(columns={'All':'Number of Tutors'})
number_of_tutors.columns.name=None
print("Q7: The number of tutors who have been matched with each Student "
"Group is shown below: \n", number_of_tutors,"\n"*2 )
# Question 8
# Filters dataframe with for active students with specified date criteria
active= match_history[(match_history['StartDate']>='2018-05-01') &
(match_history['StartDate']<='2018-06-30') &
(match_history['EndDate']==today)]
# Creating new dataframe for active students with relevant columns
active_students= active[['StudentID','StartDate','EndDate']]
print("Q8: The query for all active students who started in May and June "
"is shown below: \n", active_students,"\n")
print("The list of IDs of these active students is: ",
list(active_students['StudentID']), '\n'*2)
# Question 9
# Merges two dataframes by column name and keeping all values
student_tutor= pd.merge(student,match_history, on='StudentID', how='outer')
# Creating new dataframe with relevant columns for the query
student_tutor_filter= student_tutor[['StudentID','TutorID']]
# Filters students who has not been tutored yet
no_tutor= student_tutor_filter[(student_tutor_filter['TutorID'].isnull())]
print("Q9: The students IDs of students who has not been tutored yet are ",
list(no_tutor['StudentID']), "\n"*2)
# Question 10
# Merges two dataframes by column name and keeping all values
tutor_student= pd.merge(tutor,match_history, on='TutorID', how='outer')
# Creating new dataframe with relevant columns for the query
tutor_student_filter= tutor_student[['TutorID','StudentID']]
# Filters tutors who has not tutored yet
no_students= tutor_student_filter[tutor_student_filter['StudentID'].isnull()]
print("Q10: The tutor IDs of tutors who did not tutor any students are ",
list(no_students['TutorID']), "\n"*2)