fix None page causing pending forever #131

Open · wants to merge 2 commits into master
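For context on the title: the consumer in examples/only_grab.py only stops once it reads the None sentinel from the queue, so if a provider blows up on an empty page before that sentinel is enqueued, save() awaits queue.get() forever. The sketch below is a minimal, self-contained illustration of that pattern using hypothetical grab/find_proxies helpers (not the actual Broker API); the "if not page" guard mirrors the checks this PR adds throughout providers.py.

import asyncio


def find_proxies(page):
    # The guard this PR adds throughout providers.py: an empty or None page
    # is skipped instead of raising (re.findall on None, max() on an empty
    # list, ...), which previously killed the grab task.
    if not page:
        return []
    return page.split()


async def grab(queue, page):
    # Hypothetical stand-in for Broker.grab(): enqueue whatever was found,
    # then the None sentinel so the consumer knows the cycle is over.
    for proxy in find_proxies(page):
        await queue.put(proxy)
    await queue.put(None)


async def save(queue):
    # Mirrors save() in examples/only_grab.py: break only on the sentinel.
    while True:
        proxy = await queue.get()
        if proxy is None:
            break  # without the sentinel this await blocks forever
        print(proxy)


async def main():
    queue = asyncio.Queue()
    # Simulate a failed fetch (page is None); the pipeline still terminates.
    await asyncio.gather(grab(queue, page=None), save(queue))


if __name__ == '__main__':
    asyncio.run(main())
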
25 changes: 23 additions & 2 deletions examples/only_grab.py
@@ -2,8 +2,11 @@
checking and save them to a file."""

import asyncio
import warnings
import logging

from proxybroker import Broker
from proxybroker.providers import Provider, Blogspot_com, Spys_ru, Proxylist_me


async def save(proxies, filename):
@@ -12,20 +15,38 @@ async def save(proxies, filename):
while True:
proxy = await proxies.get()
if proxy is None:
logging.info('got None from proxies queue')
break
f.write('%s:%d\n' % (proxy.host, proxy.port))
for proto in proxy.types or ['http', 'https']:
proto = proto.lower()
row = '%s://%s:%d\n' % (proto, proxy.host, proxy.port)
f.write(row)


def main():
providers = [
# Blogspot_com(proto=('HTTP', 'HTTPS')), # noqa; 24800
Provider(
url='https://geekelectronics.org/my-servisy/proxy',
proto=('HTTP', 'CONNECT:80', 'HTTPS', 'CONNECT:25'),
), # 400
Spys_ru(proto=('HTTP', 'CONNECT:80', 'HTTPS', 'CONNECT:25')), # noqa; 660
]
proxies = asyncio.Queue()
broker = Broker(proxies)
# broker = Broker(proxies, providers=providers)
tasks = asyncio.gather(
broker.grab(countries=['US', 'GB'], limit=10),
broker.grab(),
save(proxies, filename='proxies.txt'),
)
loop = asyncio.get_event_loop()
loop.set_debug(True)
loop.slow_callback_duration = 1
# Report all mistakes managing asynchronous resources.
warnings.simplefilter('always', ResourceWarning)
loop.run_until_complete(tasks)


if __name__ == '__main__':
logging.basicConfig(level='INFO')
main()
2 changes: 1 addition & 1 deletion proxybroker/api.py
@@ -327,7 +327,7 @@ def _get_tasks(by=MAX_CONCURRENT_PROVIDERS):
proxies = await task
for proxy in proxies:
await self._handle(proxy, check=check)
log.debug('Grab cycle is complete')
log.info('Grab cycle is complete')
if self._server:
log.debug('fall asleep for %d seconds' % GRAB_PAUSE)
await asyncio.sleep(GRAB_PAUSE)
63 changes: 43 additions & 20 deletions proxybroker/providers.py
@@ -1,4 +1,5 @@
import asyncio
import os
import re
import warnings
from base64 import b64decode
@@ -79,10 +80,7 @@ async def get_proxies(self):
) as self._session:
await self._pipe()

log.debug(
'%d proxies received from %s: %s'
% (len(self.proxies), self.domain, self.proxies)
)
log.info(f'{len(self.proxies)} proxies received from {self.domain}')
return self.proxies

async def _pipe(self):
@@ -103,6 +101,8 @@ async def _find_on_pages(self, urls):

async def _find_on_page(self, url, data=None, headers=None, method='GET'):
page = await self.get(url, data=data, headers=headers, method=method)
if not page:
return
oldcount = len(self.proxies)
try:
received = self.find_proxies(page)
@@ -112,9 +112,12 @@ async def _find_on_page(self, url, data=None, headers=None, method='GET'):
'Error when executing find_proxies.'
'Domain: %s; Error: %r' % (self.domain, e)
)
if not received:
log.error(f'Got 0 proxies from {url}')
return
self.proxies = received
added = len(self.proxies) - oldcount
log.debug(
log.info(
'%d(%d) proxies added(received) from %s'
% (added, len(received), url)
)
@@ -151,7 +154,7 @@ async def _get(self, url, data=None, headers=None, method='GET'):
aiohttp.ServerDisconnectedError,
) as e:
page = ''
log.debug('%s is failed. Error: %r;' % (url, e))
log.info('%s is failed. Error: %r;' % (url, e))
return page

def find_proxies(self, page):
@@ -168,7 +171,7 @@ class Freeproxylists_com(Provider):
async def _pipe(self):
exp = r'''href\s*=\s*['"](?P<t>[^'"]*)/(?P<uts>\d{10})[^'"]*['"]'''
urls = [
'http://www.freeproxylists.com/socks.html',
# 'http://www.freeproxylists.com/socks.html',
'http://www.freeproxylists.com/elite.html',
'http://www.freeproxylists.com/anonymous.html',
]
@@ -213,6 +216,8 @@ class Webanetlabs_net(Provider):
async def _pipe(self):
exp = r'''href\s*=\s*['"]([^'"]*proxylist_at_[^'"]*)['"]'''
page = await self.get('https://webanetlabs.net/publ/24')
if not page:
return
urls = [
'https://webanetlabs.net%s' % path for path in re.findall(exp, page)
]
@@ -225,6 +230,8 @@ class Checkerproxy_net(Provider):
async def _pipe(self):
exp = r'''href\s*=\s*['"](/archive/\d{4}-\d{2}-\d{2})['"]'''
page = await self.get('https://checkerproxy.net/')
if not page:
return
urls = [
'https://checkerproxy.net/api%s' % path
for path in re.findall(exp, page)
@@ -244,6 +251,8 @@ async def _pipe(self):
) # noqa
url = 'http://www.proxz.com/proxy_list_high_anonymous_0.html'
page = await self.get(url)
if not page:
return
urls = [
'http://www.proxz.com/%s' % path for path in re.findall(exp, page)
]
@@ -264,6 +273,8 @@ async def _pipe(self):
exp = r'''href\s*=\s*['"]\./([^'"]?index\.php\?p=\d+[^'"]*)['"]'''
url = 'http://proxy-list.org/english/index.php?p=1'
page = await self.get(url)
if not page:
return
urls = [
'http://proxy-list.org/english/%s' % path
for path in re.findall(exp, page)
@@ -278,7 +289,7 @@ class Aliveproxy_com(Provider):

async def _pipe(self):
paths = [
'socks5-list',
# 'socks5-list',
'high-anonymity-proxy-list',
'anonymous-proxy-list',
'fastest-proxies',
@@ -306,6 +317,8 @@ class Maxiproxies_com(Provider):
async def _pipe(self):
exp = r'''<a href\s*=\s*['"]([^'"]*example[^'"#]*)['"]>'''
page = await self.get('http://maxiproxies.com/category/proxy-lists/')
if not page:
return
urls = re.findall(exp, page)
await self._find_on_pages(urls)

@@ -316,6 +329,8 @@ class _50kproxies_com(Provider):
async def _pipe(self):
exp = r'''<a href\s*=\s*['"]([^'"]*-proxy-list-[^'"#]*)['"]>'''
page = await self.get('http://50kproxies.com/category/proxy-list/')
if not page:
return
urls = re.findall(exp, page)
await self._find_on_pages(urls)

@@ -326,6 +341,8 @@ class Proxylist_me(Provider):
async def _pipe(self):
exp = r'''href\s*=\s*['"][^'"]*/?page=(\d+)['"]'''
page = await self.get('https://proxylist.me/')
if not page:
return
lastId = max([int(n) for n in re.findall(exp, page)])
urls = ['https://proxylist.me/?page=%d' % n for n in range(lastId)]
await self._find_on_pages(urls)
@@ -503,6 +520,8 @@ class Proxynova_com(Provider):
async def _pipe(self):
expCountries = r'"([a-z]{2})"'
page = await self.get('https://www.proxynova.com/proxy-server-list/')
if not page:
return
tpl = 'https://www.proxynova.com/proxy-server-list/country-%s/'
urls = [
tpl % isoCode
@@ -548,6 +567,8 @@ async def _pipe(self):
expSession = r"'([a-z0-9]{32})'"
url = 'http://spys.one/proxies/'
page = await self.get(url)
if not page:
return
sessionId = re.findall(expSession, page)[0]
data = {
'xf0': sessionId, # session id
@@ -574,6 +595,8 @@ async def _pipe(self):
exp = r'''href\s*=\s*['"]([^'"]?free-[^'"]*)['"]'''
url = 'https://www.my-proxy.com/free-proxy-list.html'
page = await self.get(url)
if not page:
return
urls = [
'https://www.my-proxy.com/%s' % path
for path in re.findall(exp, page)
@@ -670,7 +693,7 @@ class Proxylistplus_com(Provider):
domain = 'list.proxylistplus.com'

async def _pipe(self):
names = ['Fresh-HTTP-Proxy', 'SSL', 'Socks']
names = ['Fresh-HTTP-Proxy'] # , 'SSL', 'Socks']
urls = [
'http://list.proxylistplus.com/%s-List-%d' % (i, n)
for i in names
@@ -686,8 +709,8 @@ async def _pipe(self):
urls = [
'https://www.proxy-list.download/api/v1/get?type=http',
'https://www.proxy-list.download/api/v1/get?type=https',
'https://www.proxy-list.download/api/v1/get?type=socks4',
'https://www.proxy-list.download/api/v1/get?type=socks5',
# 'https://www.proxy-list.download/api/v1/get?type=socks4',
# 'https://www.proxy-list.download/api/v1/get?type=socks5',
]
await self._find_on_pages(urls)

@@ -731,10 +754,10 @@ def __init__(self, *args, **kwargs):
proto=('HTTP', 'CONNECT:80', 'HTTPS', 'CONNECT:25'),
), # 200
Provider(
url='http://fineproxy.org/eng/fresh-proxies/',
url='https://t.me/s/proxiesfine',
proto=('HTTP', 'CONNECT:80', 'HTTPS', 'CONNECT:25'),
), # 5500
Provider(url='https://socks-proxy.net/', proto=('SOCKS4', 'SOCKS5')), # 80
), # 4200
# Provider(url='https://socks-proxy.net/', proto=('SOCKS4', 'SOCKS5')), # 80
Provider(
url='http://www.httptunnel.ge/ProxyListForFree.aspx',
proto=('HTTP', 'CONNECT:80', 'HTTPS', 'CONNECT:25'),
@@ -782,12 +805,12 @@ def __init__(self, *args, **kwargs):
Blogspot_com(
proto=('HTTP', 'CONNECT:80', 'HTTPS', 'CONNECT:25')
), # noqa; 24800
Gatherproxy_com_socks(proto=('SOCKS4', 'SOCKS5')), # noqa; 30
Blogspot_com_socks(proto=('SOCKS4', 'SOCKS5')), # noqa; 1486
Tools_rosinstrument_com(
proto=('HTTP', 'CONNECT:80', 'HTTPS', 'CONNECT:25')
), # noqa; 4000
Tools_rosinstrument_com_socks(proto=('SOCKS4', 'SOCKS5')), # noqa; 1800
# Gatherproxy_com_socks(proto=('SOCKS4', 'SOCKS5')), # noqa; 30
# Blogspot_com_socks(proto=('SOCKS4', 'SOCKS5')), # noqa; 1486
# Tools_rosinstrument_com(
# proto=('HTTP', 'CONNECT:80', 'HTTPS', 'CONNECT:25')
# ), # noqa; 4000
# Tools_rosinstrument_com_socks(proto=('SOCKS4', 'SOCKS5')), # noqa; 1800
My_proxy_com(max_conn=2), # noqa; 1000
Checkerproxy_net(), # noqa; 60000
Aliveproxy_com(), # noqa; 210