-
Notifications
You must be signed in to change notification settings - Fork 1
/
importData.py
78 lines (65 loc) · 1.96 KB
/
importData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 6 21:27:19 2019
@author: Ilias
"""
import numpy as np
import pandas as pd
import statistics
import matplotlib.pyplot as plt
# Load the raw export; everything below is derived from this one frame.
dataset = pd.read_csv('C:/Users/Ilias/Desktop/dataset1998/PRdata.txt')

# Classes / unique products: the itemId values of the first 294 rows.
itemId2 = dataset['itemId'][0:294]

# All rows after those first 294, restricted to the first two columns
# (accessed further down as `A` and `itemId`).
data = dataset.iloc[294:, 0:2]

# Session ids 10001..42711 inclusive, stored as column 0 of a DataFrame.
sessions = pd.DataFrame(list(range(10001, 42712)))
# Item ids of every row flagged 'V', in file order.
# A single boolean-mask selection replaces the former iterrows() loop, which
# built a Series per row without using it and performed two label lookups
# (data.A[index], data.itemId[index]) for every row.
c = data.loc[data['A'] == 'V', 'itemId'].tolist()
clicks = pd.DataFrame(c)  # itemIds, one per row in column 0
# Find the length of each session, i.e. the number of 'V' rows it contains.
# A 'C' row is treated as a session boundary: the running count is flushed
# whenever one is seen, and once more after the last row.
length = []
count = 0
# Walk the flag column directly; the former iterrows() loop never used the
# row object it created and looked the flag up by label on every access.
for flag in data['A']:
    if flag == 'V':
        count += 1
    elif flag == 'C':
        length.append(count)
        count = 0
length.append(count)
length = pd.DataFrame(length)
# The first entry is whatever was counted before the first 'C' row, so it
# does not correspond to a session; drop it. The remaining rows keep their
# original index labels (starting at 1).
length.drop(length.index[0], inplace=True)
# Disabled plotting snippet, kept as a bare string literal so it never runs:
# bar chart of how often each item id occurs in `clicks`.
# NOTE(review): pd.value_counts is deprecated in recent pandas releases;
# if this is re-enabled, clicks[0].value_counts() is the likely replacement
# -- confirm against the installed pandas version.
'''pd.value_counts(clicks[0]).plot.bar()
plt.title('click class histogram')
plt.xlabel('item')
plt.ylabel('Frequency')
clicks[0].value_counts()'''
# Disabled plotting snippet (same mechanism): bar chart of how often each
# session length occurs in `length`.
'''pd.value_counts(length[0]).plot.bar()
plt.title('sessionLength frequency histogram')
plt.xlabel('item')
plt.ylabel('Frequency')
length[0].value_counts()'''
# Create a general DataFrame of the dataset: one row per click, carrying the
# session id, the length of that session and the clicked item id.
#
# Values are read from column 0 explicitly. The previous loop called int() on
# frame.iloc[i], i.e. on a one-element row Series, which pandas has
# deprecated, and it issued three .iloc lookups per click.
_lengths = length[0].to_numpy()
_session_ids = sessions[0].to_numpy()[:len(_lengths)]

# Total number of clicks consumed; same final value the loop counter reached.
Idx = int(_lengths.sum())

# Session i contributes _lengths[i] consecutive rows, so its id and its
# length are each repeated that many times (zero-length sessions vanish).
l = np.repeat(_lengths, _lengths).tolist()
s = np.repeat(_session_ids, _lengths).tolist()

# Clicks are consumed strictly in order, Idx of them in total. The guard
# keeps the no-click case from touching a column that may not exist.
it = [int(item) for item in clicks.iloc[:Idx, 0]] if Idx else []

d = {'sessions': s, 'length': l, 'item': it}
df = pd.DataFrame(d)
# Filter out sessions with only one click.
# NOTE(review): the original (transliterated Greek) remark here appears to
# say that the "2"-suffixed tables hold the unique / filtered values --
# confirm with the author.
multi_click_rows = df['length'].ne(1)
df = df.loc[multi_click_rows]
#df.to_csv('C:/Users/Ilias/Desktop/dataset1998/PRdataDataFrame.csv',index=False)

# Distinct session ids that survive the filter.
sessions2 = df['sessions'].unique()

# Per-session lengths with the single-click sessions removed.
# Author's note: length2.mean()=4
length2 = length.loc[length[0].ne(1)]
# Disabled plotting snippet, kept as a bare string literal so it never runs:
# bar chart of how often each session length occurs in `length2`.
# NOTE(review): pd.value_counts is deprecated in recent pandas releases;
# if this is re-enabled, length2[0].value_counts() is the likely replacement
# -- confirm against the installed pandas version.
'''pd.value_counts(length2[0]).plot.bar()
plt.title('sessionLength2 frequency histogram')
plt.xlabel('item')
plt.ylabel('Frequency')
length2[0].value_counts()'''