-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_lost.py
59 lines (46 loc) · 2.94 KB
/
find_lost.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/python3
# -*- coding:utf-8 -*-
# Copyright © 2021 - wwyqianqian <wwyqianqian@mails.ccnu.edu.com>
movieid = list()
title = list()
genres = list()
search_mov_url = list()
detail_url = list()
got_movieid = list()
got_title = list()
got_genres = list()
got_search_mov_url = list()
got_detail_url = list()
got_img = list()
got_content = list()
lost_movieid = list()
# input format
# 1::Toy Story (1995)::Animation|Children's|Comedy::Toy+Story+%281995%29::https://www.imdb.com/title/tt0114709/
for line in open("excess_log.txt", encoding = "ISO-8859-1"):
movieid.append(line.rstrip().split("::")[0])
title.append(line.rstrip().split("::")[1])
genres.append(line.rstrip().split("::")[2])
search_mov_url.append(line.rstrip().split("::")[3])
detail_url.append(line.rstrip().split("::")[4])
# input format
# 1::Toy Story (1995)::Animation|Children's|Comedy::Toy+Story+%281995%29::https://www.imdb.com/title/tt0114709/::https://m.media-amazon.com/images/M/MV5BMDU2ZWJlMjktMTRhMy00ZTA5LWEzNDgtYmNmZTEwZTViZWJkXkEyXkFqcGdeQXVyNDQ2OTk4MzI@._V1_UX182_CR0,0,182,268_AL_.jpg::A cowboy doll is profoundly threatened and jealous when a new spaceman figure supplants him as top toy in a boy's room.
for got_line in open("Scrape_IMDb.txt", encoding = "ISO-8859-1"):
got_movieid.append(got_line.rstrip().split("::")[0])
got_title.append(got_line.rstrip().split("::")[1])
got_genres.append(got_line.rstrip().split("::")[2])
got_search_mov_url.append(got_line.rstrip().split("::")[3])
got_detail_url.append(got_line.rstrip().split("::")[4])
got_img.append(got_line.rstrip().split("::")[5])
got_content.append(got_line.rstrip().split("::")[6])
for original_id_index in range(len(movieid)):
if movieid[original_id_index] not in got_movieid:
lost_movieid.append(movieid[original_id_index])
# if 90, then correct
# print(lost_movieid)
# ['32', '96', '140', '142', '253', '264', '328', '390', '402', '405', '477', '506', '559', '655', '718', '727', '730', '735', '752', '769', '770', '780', '855', '888', '1065', '1107', '1130', '1142', '1144', '1157', '1181', '1202', '1251', '1261', '1294', '1319', '1335', '1340', '1421', '1423', '1430', '1433', '1434', '1454', '1547', '1555', '1559', '1706', '1738', '1792', '1815', '1969', '1976', '1978', '1988', '2014', '2148', '2215', '2227', '2360', '2422', '2503', '2510', '2576', '2586', '2593', '2678', '2783', '2994', '3022', '3145', '3192', '3193', '3279', '3309', '3320', '3399', '3421', '3448', '3487', '3569', '3623', '3664', '3687', '3703', '3819', '3847', '3890', '3892', '3930']
for lost_id_index in range(len(lost_movieid)):
x = movieid.index(lost_movieid[lost_id_index])
with open('lost_id_1.txt', 'a+', encoding='utf-8') as f:
f.write(movieid[x] + "::" + title[x] + "::" + genres[x] + "::" + search_mov_url[x] + "::" + detail_url[x] + '\n')
# output format
# 140::Up Close and Personal (1996)::Drama|Romance::Up+Close+and+Personal+%281996%29::https://www.imdb.com/title/tt0497080/