Skip to content

Commit

Permalink
Add appbrain scrape
Browse files Browse the repository at this point in the history
  • Loading branch information
ddxv committed Oct 17, 2023
1 parent e629f10 commit 7ff0f34
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 0 deletions.
112 changes: 112 additions & 0 deletions adscrawler/app_stores/appbrain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import re
import time

import requests

from adscrawler.config import get_logger

logger = get_logger(__name__)

# Root of AppBrain's app-listing pages; collection/category path parts are appended.
APPBRAIN_BASE_URL = "https://www.appbrain.com/apps/"

# Ranking flavors AppBrain exposes; each is crawled with and without a category filter.
APPBRAIN_COLLECTIONS = ["hot", "hot-week", "popular", "highest-rated"]

# URL slugs for AppBrain's category listing pages.
# NOTE(review): hand-maintained — presumably mirrors Google Play categories; verify
# against the site if scrapes start returning empty category pages.
APPBRAIN_CATEGORIES = [
    "action",
    "adventure",
    "arcade",
    "art-and-design",
    "auto-and-vehicles",
    "beauty",
    "board",
    "books-and-reference",
    "business",
    "card",
    "casino",
    "casual",
    "comics",
    "communication",
    "dating",
    "education",
    "educational",
    "entertainment",
    "events",
    "finance",
    "food-and-drink",
    "health-and-fitness",
    "house-and-home",
    "libraries-and-demo",
    "lifestyle",
    "maps-and-navigation",
    "medical",
    "music",
    "music-and-audio",
    "news-and-magazines",
    "parenting",
    "personalization",
    "photography",
    "productivity",
    "puzzle",
    "racing",
    "role-playing",
    "simulation",
    "social",
    "sports",
    "sports-games",
    "strategy",
    "tools",
    "travel-and-local",
    "trivia",
    "video-players-and-editors",
    "weather",
    "word",
]

# Captures the final path segment of app links like href="/app/<name>/<package.id>",
# i.e. the Android package id used as the store id.
HTML_PATTERN = r'href="/app/.*?/([^"/]+)"'


# Desktop-browser User-Agent sent with every request.
# NOTE(review): assumption — likely needed to avoid bot-blocked responses; confirm.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/118.0"
}


def scrape_for_ids(collection: str, category: str | None = None) -> list[str]:
    """Fetch one AppBrain listing page and return the Android package ids found.

    Args:
        collection: One of APPBRAIN_COLLECTIONS (e.g. "hot", "popular").
        category: Optional slug from APPBRAIN_CATEGORIES; when None the
            collection-wide "new" listing is fetched.

    Returns:
        Package ids scraped from the page (may contain duplicates); an empty
        list when the response status is not 200.
    """
    if category is None:
        url_part = f"/{collection}/new"
    else:
        url_part = f"/{collection}/{category}/new"
    # timeout= prevents one hung connection from stalling the whole crawl
    # (requests blocks forever by default).
    response = requests.get(
        APPBRAIN_BASE_URL + url_part, headers=HEADERS, timeout=30
    )
    if response.status_code == 200:
        # .text honors the charset declared by the server instead of
        # hard-coding utf-8 for the raw bytes.
        packages = re.findall(HTML_PATTERN, response.text)
    else:
        logger.error(
            f"Response code not 200: {response.status_code=} {response.content=}"
        )
        packages = []
    # Politeness delay between page fetches.
    time.sleep(1)
    return packages


def loop_categories() -> list[str]:
    """Crawl every AppBrain collection and category listing.

    Returns:
        De-duplicated package ids from all pages, in arbitrary order (the
        original accumulated via set() as well, so order was never guaranteed).
    """
    # Accumulate in a set: the original rebuilt list(set(packages + all_packages))
    # after every page, re-hashing the whole accumulator each time.
    all_packages: set[str] = set()
    for collection in APPBRAIN_COLLECTIONS:
        # Collection-wide listing (no category filter).
        packages = scrape_for_ids(collection=collection)
        all_packages.update(packages)
        logger.info(
            f"AppBrain {collection=} total:{len(all_packages)} packages:{len(packages)}"
        )
        # Extra politeness delay on top of scrape_for_ids' own sleep.
        time.sleep(1)
        for category in APPBRAIN_CATEGORIES:
            packages = scrape_for_ids(collection=collection, category=category)
            all_packages.update(packages)
            logger.info(
                f"AppBrain {collection=} {category=} total:{len(all_packages)} packages:{len(packages)}"
            )
            time.sleep(1)
    return list(all_packages)


def get_appbrain_android_apps() -> list[dict]:
    """Scrape AppBrain and wrap each package id in a store/app dict.

    Each entry has the shape {"store": 1, "store_id": <package id>}
    (store 1 — presumably Google Play; confirm against the DB schema).
    """
    return [
        {"store": 1, "store_id": store_id} for store_id in loop_categories()
    ]
10 changes: 10 additions & 0 deletions adscrawler/app_stores/scrape_stores.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from itunes_app_scraper.util import AppStoreException

from adscrawler.app_stores.apkcombo import get_apkcombo_android_apps
from adscrawler.app_stores.appbrain import get_appbrain_android_apps
from adscrawler.app_stores.apple import (
clean_ios_app_df,
crawl_ios_developers,
Expand Down Expand Up @@ -83,6 +84,15 @@ def scrape_stores_frontpage(
)
except Exception:
logger.exception("ApkCombo RSS feed failed")
try:
dicts = get_appbrain_android_apps()
process_scraped(
database_connection=database_connection,
ranked_dicts=dicts,
crawl_source="scrape_appbrain",
)
except Exception:
logger.exception("ApkCombo RSS feed failed")
return


Expand Down

0 comments on commit 7ff0f34

Please sign in to comment.