-
Notifications
You must be signed in to change notification settings - Fork 11
/
load.py
36 lines (31 loc) · 978 Bytes
/
load.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from os import listdir
import json
import sys
import glob
# year = sys.argv[1] if len(sys.argv) > 1 else None
# if not year: raise "shit"
SUBTITLE_ROOT = './OpenSubtitles2013/xml/en/'


def _first_with_suffix(names, suffix):
    """Return the first entry of *names* that ends with *suffix*, or None."""
    for name in names:
        if name.endswith(suffix):
            return name
    return None


def _load_movie(movie_dir):
    """Build the record for a single subtitle directory.

    Returns a dict holding every key of the directory's JSON metadata plus
    ``osID`` (the last ``/``-separated component of *movie_dir*) and
    ``script`` (the full contents of the ``.txt`` file).  Returns None when
    the directory has no ``.json`` file, no ``.txt`` file, or when the
    metadata ``Type`` is anything other than ``'movie'``.
    """
    entries = listdir(movie_dir)
    meta_name = _first_with_suffix(entries, '.json')
    script_name = _first_with_suffix(entries, '.txt')
    if meta_name is None or script_name is None:
        return None

    # Context managers close the handles; the original left both files open.
    with open(movie_dir + '/' + meta_name) as meta_file:
        record = dict(json.load(meta_file))

    # Check the type before touching the script so non-movies cost no I/O.
    if record.get('Type') != 'movie':
        return None

    if record.get('Genre'):
        # Turn the ', '-separated genre string into a list of genres.
        record['Genre'] = record['Genre'].split(', ')
    record['osID'] = movie_dir.split("/")[-1]

    with open(movie_dir + '/' + script_name) as script_file:
        record['script'] = script_file.read()
    return record


def load_movies(root, years):
    """Collect the records of every movie found under ``<root>/<year>/*``.

    Args:
        root: Directory that contains one sub-directory per year.  A
            trailing ``/`` is optional.
        years: Iterable of year values; each is converted with ``str`` to
            form the sub-directory name.

    Returns:
        A list of record dicts (see ``_load_movie``), one per directory
        that yielded a movie.  Year directories that do not exist simply
        contribute nothing.
    """
    base = root.rstrip('/')
    found = []
    for year in years:
        year_dir = base + '/' + str(year)
        for movie_dir in glob.glob(year_dir + '/*'):
            record = _load_movie(movie_dir)
            if record is not None:
                found.append(record)
    return found


years = range(0, 1985)
movies = load_movies(SUBTITLE_ROOT, years)