This repository has been archived by the owner on Jan 25, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
87 lines (62 loc) · 2.38 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from bs4 import BeautifulSoup
import urllib
import sys
from pymongo import MongoClient
class record:
    """Plain container for one scraped NGO listing entry.

    Every constructor argument is stored unchanged on the attribute of the
    same name.  The attributes are assigned in a fixed order (name, regno,
    city, state, chief, address, sectors) because the instance ``__dict__``
    is what the script hands to MongoDB as the document.
    """

    def __init__(self, name, regno, city, state, chief, address, sectors):
        # Identity of the organisation.
        self.name, self.regno = name, regno
        # Location details.
        self.city, self.state = city, state
        # Contact person, postal address and areas of work.
        self.chief, self.address, self.sectors = chief, address, sectors
def _extract_cities(reg_text):
    """Return the distinct city names found in a registration cell's text.

    For every comma in ``reg_text`` the text between the closest preceding
    ``)`` and that comma is taken as a city (presumably the cell reads like
    ``<reg no> (<...>) <city>, <state>`` -- confirm against a live page).
    Commas with no ``)`` before them are ignored, empty results are dropped,
    and duplicates are removed while keeping first-seen order so the result
    is deterministic.
    """
    cities = []
    for comma_pos, char in enumerate(reg_text):
        if char != ',':
            continue
        paren_pos = reg_text.rfind(')', 0, comma_pos)
        if paren_pos == -1:
            continue
        city = reg_text[paren_pos + 1:comma_pos].strip()
        if city and city not in cities:
            cities.append(city)
    return cities


def _extract_regnos(reg_text, statename):
    """Return the registration numbers found in a registration cell's text.

    The first number is whatever precedes the first ``(``.  Each further
    number is the text that follows an occurrence of ``statename`` up to the
    next ``(``.  Empty candidates are discarded.
    """
    remainder = reg_text.strip()
    # partition() keeps the whole text when there is no "(", whereas slicing
    # with find()'s -1 would silently chop off the last character.
    candidates = [remainder.partition("(")[0].strip()]
    # An empty state name is found at index 0 of every string, which would
    # make the loop below spin forever; skip the scan in that case.
    if statename:
        found_at = remainder.find(statename)
        while found_at != -1:
            remainder = remainder[found_at + len(statename):]
            candidates.append(remainder.partition("(")[0].strip())
            found_at = remainder.find(statename)
    return [regno for regno in candidates if regno]


def scrapestate(SC):
    """Download and parse the NGO listing page for one state.

    Args:
        SC: State code string; it is appended verbatim to the listing URL's
            ``state_value`` query parameter.

    Returns:
        A list of ``record`` objects, one per table row that has exactly six
        top-aligned cells.  ``regno`` and ``city`` are lists of strings,
        ``sectors`` is the comma-split sector cell, and ``state`` is the
        state name read from the page heading.  An empty list is returned
        when the page lacks the expected results table or heading.

    Raises:
        Whatever ``urlopen`` raises when the page cannot be fetched.
    """
    # Function-scope imports keep this block self-contained and let it run
    # on both Python 2 (urllib.urlopen) and Python 3 (urllib.request.urlopen).
    from contextlib import closing
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    SCRAPE_URL = "http://ngo.india.gov.in/state_ngolist_ngo.php?records=65535&state_value=" + SC
    # closing() guarantees the HTTP response is released even if read() fails.
    with closing(urlopen(SCRAPE_URL)) as response:
        page = response.read()

    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed; consider pinning one once the expected parse is confirmed.
    soup = BeautifulSoup(page)
    table = soup.find("table", {"width": "100%", "border": "0", "align": "center",
                                "cellspacing": "0", "cellpadding": "5"})
    headings = soup.findAll("strong")
    if table is None or len(headings) < 3:
        # Page does not have the expected layout: nothing to extract.
        return []

    # The third <strong> element holds the state name, optionally followed
    # by a parenthesised suffix that is not part of the name.
    statename = headings[2].text.partition("(")[0].strip()

    recordlist = []
    for row in table.findAll("tr"):
        # Only rows with exactly six top-aligned cells are data rows.
        if len(row.findAll("td", {"valign": "top"})) != 6:
            continue
        cells = row.findAll("td")
        reg_text = cells[2].text
        recordlist.append(record(
            cells[1].text.strip(),
            _extract_regnos(reg_text, statename),
            _extract_cities(reg_text),
            statename,
            cells[3].text.strip(),
            cells[4].text.strip(),
            cells[5].text.strip().split(','),
        ))
    return recordlist
# MongoClient() is called without arguments, so pymongo's default connection
# settings apply; everything is written to the "ngodata" collection of the
# "test" database.
client = MongoClient()
db = client["test"]
ngo_collection = db["ngodata"]

# State codes passed one by one to scrapestate(), in this order.
statecode = (
    "AN AP AR AS BR CH CG DN DD DL GA GJ HR HP JK JH KA KL "
    "LD MP MH MN ML MZ NL OR PY PB RJ SK TN TR UP UA WB"
).split()

for SC in statecode:
    records = scrapestate(SC)
    # One document per record; the record's attribute dict is the document.
    for entry in records:
        ngo_collection.insert_one(vars(entry))
    # Progress line, e.g. "<count> records written for <state code>".
    print("%d records written for %s" % (len(records), SC))