-
Notifications
You must be signed in to change notification settings - Fork 1
/
init_url_terms.py
69 lines (65 loc) · 2.41 KB
/
init_url_terms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import csv
from pymongo import MongoClient
import copy
# One-off initialisation script: loads the URL-term list and the company list
# from CSV files and seeds the MongoDB collections used by the scraper.
#
# Written to run unchanged on both Python 2 and Python 3 (the original used
# the Python 2 ``print`` statement, which is a SyntaxError on Python 3).

# Connection settings for the MongoDB instance.
# NOTE(review): credentials are hard-coded in source control; consider moving
# them to environment variables or an untracked config file.
MONGO_HOST = '127.0.0.1'
MONGO_PORT = 8080
DB_NAME = 'scrader'
DB_USER = 'scrader'
DB_PASSWORD = 'scr@d3r'

# Input files, resolved relative to the current working directory.
URL_TERMS_CSV = 'LIST_OF_URLS.csv'
COMPANIES_CSV = 'COMPANY_NAMES.csv'

# CSV column headers holding company synonyms. The spelling matches the
# headers in COMPANY_NAMES.csv and must not be "corrected" here.
synonims_keys = ['SYNOMYM 1', 'SYNOMYM 2', 'SYNOMYM 3', 'SYNOMYM 4']


def read_url_terms(rows):
    """Return the 'URL TERMS' value of every row, in input order.

    ``rows`` is any iterable of dict-like rows (e.g. a ``csv.DictReader``).
    Values are taken as-is; a row without the column contributes ``None``.
    """
    return [row.get('URL TERMS') for row in rows]


def build_companies(rows, synonym_columns=None):
    """Group CSV rows into one document per company.

    A row with a non-empty 'COMPANY NAMES' value starts a new company
    document of the form::

        {'company_name': <name>,
         'synonims': [<non-empty synonym columns...>, <name>],
         'url_terms': [<URL TERMS of this row>, ...]}

    A row whose 'COMPANY NAMES' is empty (or missing/None) is a continuation
    row: its 'URL TERMS' value is appended to the most recent company.
    Continuation rows that appear before any company are skipped, because
    there is nothing to attach them to (the original script raised
    NameError in that situation).

    ``rows`` is any iterable of dict-like rows; ``synonym_columns`` defaults
    to the module-level ``synonims_keys``. Returns the list of documents.
    """
    if synonym_columns is None:
        synonym_columns = synonims_keys

    companies = []
    current = None  # company document that continuation rows extend
    for row in rows:
        name = row.get('COMPANY NAMES')
        if name:
            # Empty strings and None both mean "no synonym in this column".
            synonyms = [row.get(col) for col in synonym_columns
                        if row.get(col)]
            # The company's own name always counts as its last synonym.
            synonyms.append(name)
            current = {
                'company_name': name,
                'synonims': synonyms,
                'url_terms': [row.get('URL TERMS')],
            }
            companies.append(current)
        elif current is not None:
            current['url_terms'].append(row.get('URL TERMS'))
    return companies


def main():
    """Seed the 'url_terms' collection and build the company documents.

    Connects to MongoDB, inserts a single ``{'url_terms': [...]}`` document
    built from LIST_OF_URLS.csv (echoing each term to stdout), then builds
    the company documents from COMPANY_NAMES.csv and returns them.
    """
    dbcli = MongoClient(MONGO_HOST, MONGO_PORT)
    db = dbcli[DB_NAME]
    # NOTE(review): Database.authenticate() is deprecated in PyMongo 3.x and
    # removed in PyMongo 4.0; when upgrading, pass username/password/
    # authSource to MongoClient instead.
    db.authenticate(DB_USER, DB_PASSWORD)

    with open(URL_TERMS_CSV) as csvfile:
        url_terms = read_url_terms(csv.DictReader(csvfile))
    for term in url_terms:
        print(term)
    db['url_terms'].insert_one({'url_terms': url_terms})

    with open(COMPANIES_CSV) as csvfile:
        companies = build_companies(csv.DictReader(csvfile))
    # NOTE(review): the insert below was already commented out in the
    # original script, so the company documents are built but not persisted.
    # Re-enable deliberately once the target collection should be (re)seeded.
    # db['scraper_companies'].insert_many(companies)
    return companies


if __name__ == '__main__':
    main()