From 296e43680e62ac369df83e14f2b17a3e59a8b5ae Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 2 Oct 2023 02:38:31 +0100 Subject: [PATCH 01/11] [XHamster] Set default UA 'Mozilla' to bypass captcha page Resolves #32539 --- youtube_dl/extractor/xhamster.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index e17947fc6a7..d6d8ec05e25 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -5,7 +5,10 @@ import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_kwargs, + compat_str, +) from ..utils import ( clean_html, determine_ext, @@ -23,7 +26,28 @@ ) -class XHamsterIE(InfoExtractor): +class XHamsterBaseIE(InfoExtractor): + def _download_webpage_handle(self, url, video_id, *args, **kwargs): + # note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None) + # default UA to 'Mozilla' (only) to avoid interstitial page + headers = (args[5] if len(args) > 5 else kwargs.get('headers')) + if 'User-Agent' not in (headers or {}): + if len(args) > 5: + args = list(args) + headers = headers or {} + args[5] = headers + elif not isinstance(headers, dict): + headers = {} + headers['User-Agent'] = 'Mozilla' + if len(args) <= 5: + if not kwargs.get('headers'): + kwargs['headers'] = headers + kwargs = compat_kwargs(kwargs) + return super(XHamsterBaseIE, self)._download_webpage_handle( + url, video_id, *args, **kwargs) + + +class XHamsterIE(XHamsterBaseIE): _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com|xhday\.com|xhvid\.com)' _VALID_URL = r'''(?x) https?:// @@ -377,7 +401,7 @@ def get_height(s): } -class XHamsterEmbedIE(InfoExtractor): +class XHamsterEmbedIE(XHamsterBaseIE): _VALID_URL = r'https?://(?:.+?\.)?%s/xembed\.php\?video=(?P\d+)' % XHamsterIE._DOMAINS _TEST = { 'url': 
'http://xhamster.com/xembed.php?video=3328539', From bafa9d7d0190f09bc7b1b5d301cb90965df7a060 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 2 Oct 2023 02:46:44 +0100 Subject: [PATCH 02/11] [XHamster] Update domain list * include domains listed as trusted in page, aliased to xhamster.com * excluding domains that redirect to xhamster (eg xhday.com) --- youtube_dl/extractor/xhamster.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index d6d8ec05e25..7538b73c48b 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -48,7 +48,17 @@ def _download_webpage_handle(self, url, video_id, *args, **kwargs): class XHamsterIE(XHamsterBaseIE): - _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com|xhday\.com|xhvid\.com)' + # base domains that don't redirect to xhamster.com (not xhday\d\.com, eg) + _DOMAINS = '(?:%s)' % '|'.join(( + r'xhamster\d*\.(?:com|desi)', + r'xhamster\.one', + r'xhms\.pro', + r'xh(?:open|access|victory|big|channel)\.com', + r'(?:full|mega)xh\.com', + r'xh(?:vid|official|planet)\d*\.com', + # requires Tor + r'xhamster[a-z2-7]+\.onion', + )) _VALID_URL = r'''(?x) https?:// (?:.+?\.)?%s/ @@ -145,9 +155,7 @@ class XHamsterIE(XHamsterBaseIE): 'url': 'http://de.xhamster.com/videos/skinny-girl-fucks-herself-hard-in-the-forest-xhnBJZx', 'only_matching': True, }, { - 'url': 'https://xhday.com/videos/strapless-threesome-xhh7yVf', - 'only_matching': True, - }, { + # 'url': 'https://xhday.com/videos/strapless-threesome-xhh7yVf', 'url': 'https://xhvid.com/videos/lk-mm-xhc6wn6', 'only_matching': True, }] @@ -458,9 +466,8 @@ class XHamsterUserIE(InfoExtractor): }, 'playlist_mincount': 1, }, { - 'url': 'https://xhday.com/users/mobhunter', - 'only_matching': True, - }, { + # the below doesn't match but is redirected via generic + # 'url': 'https://xhday.com/users/mobhunter', 'url': 
'https://xhvid.com/users/pelushe21', 'only_matching': True, }] From 6845e4e97146765dbeca480c584efb39ee8a50e3 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 2 Oct 2023 02:50:29 +0100 Subject: [PATCH 03/11] [XHamster] Revise video extraction * re-factor extraction code * use traverse_obj() --- youtube_dl/extractor/xhamster.py | 334 ++++++++++++++----------------- 1 file changed, 151 insertions(+), 183 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 7538b73c48b..f84d55118b7 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -7,19 +7,22 @@ from .common import InfoExtractor from ..compat import ( compat_kwargs, - compat_str, + compat_urlparse, ) from ..utils import ( clean_html, determine_ext, - dict_get, extract_attributes, ExtractorError, float_or_none, int_or_none, + join_nonempty, + merge_dicts, parse_duration, - str_or_none, - try_get, + parse_qs, + T, + traverse_obj, + txt_or_none, unified_strdate, url_or_none, urljoin, @@ -160,10 +163,107 @@ class XHamsterIE(XHamsterBaseIE): 'only_matching': True, }] + def _get_height(self, s): + return int_or_none(self._search_regex( + r'^(\d+)[pP]', s, 'height', default=None)) + + def _extract_initials(self, initials, video_id, display_id, url, referrer, age_limit): + video = initials['videoModel'] + title = video['title'] + formats = [] + format_urls = set() + format_sizes = {} + http_headers = {'Referer': referrer} + for quality, size in traverse_obj(video, ( + 'sources', 'download', T(dict.items), Ellipsis, + T(lambda kv: (kv[0], float_or_none(kv[1]['size']))), + T(lambda kv: (kv[1] is not None) and kv))): + format_sizes[quality] = size + # Download link takes some time to be generated, + # skipping for now + for format_id, formats_dict in traverse_obj(video, ( + 'sources', T(dict.items), + lambda _, kv: kv[0] != 'download' and isinstance(kv[1], dict))): + for quality, format_url in traverse_obj(formats_dict, ( + T(dict.items), 
Ellipsis, + T(lambda kv: (kv[0], url_or_none(kv[1]))))): + if format_url in format_urls: + continue + format_urls.add(format_url) + formats.append({ + 'format_id': '%s-%s' % (format_id, quality), + 'url': format_url, + 'ext': determine_ext(format_url, 'mp4'), + 'height': self._get_height(quality), + 'filesize': format_sizes.get(quality), + 'http_headers': http_headers, + }) + xplayer_sources = traverse_obj( + initials, ('xplayerSettings', 'sources', T(dict))) + for hls_url in traverse_obj( + xplayer_sources, + ('hls', ('url', 'fallback'), T(lambda u: urljoin(url, u)))): + if hls_url in format_urls: + continue + format_urls.add(hls_url) + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + for format_id, formats_list in traverse_obj( + xplayer_sources, ('standard', T(dict.items), Ellipsis)): + for standard_format in traverse_obj(formats_list, Ellipsis): + for standard_url in traverse_obj( + standard_format, + (('url', 'fallback'), T(lambda u: urljoin(url, u)))): + format_urls.add(standard_url) + ext = determine_ext(standard_url, 'mp4') + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + standard_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + quality = traverse_obj(standard_format, (('quality', 'label'), T(txt_or_none)), get_all=False) or '' + formats.append({ + 'format_id': '%s-%s' % (format_id, quality), + 'url': standard_url, + 'ext': ext, + 'height': self._get_height(quality), + 'filesize': format_sizes.get(quality), + 'http_headers': { + 'Referer': standard_url, + }, + }) + self._sort_formats( + formats, field_preference=('height', 'width', 'tbr', 'format_id')) + + categories = traverse_obj(video, ('categories', Ellipsis, 'name', T(txt_or_none))) or None + uploader_url = traverse_obj(video, ('author', 'pageURL', T(url_or_none))) + + return merge_dicts({ + 'id': video_id, + 'display_id': display_id, + 'title': title, + 
'uploader_url': uploader_url, + 'uploader_id': uploader_url.split('/')[-1] if uploader_url else None, + 'age_limit': age_limit if age_limit is not None else 18, + 'categories': categories, + 'formats': formats, + }, traverse_obj(video, { + 'description': ('description', T(txt_or_none)), + 'timestamp': ('created', T(int_or_none)), + 'uploader': ('author', 'name', T(txt_or_none)), + 'thumbnail': ('thumbURL', T(url_or_none)), + 'duration': ('duration', T(int_or_none)), + 'view_count': ('views', T(int_or_none)), + 'like_count': ('rating', 'likes', T(int_or_none)), + 'dislike_count': ('rating', 'dislikes', T(int_or_none)), + 'comment_count': ('comments', T(int_or_none)), + })) + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') or mobj.group('id_2') - display_id = mobj.group('display_id') or mobj.group('display_id_2') + mobj = self._match_valid_url(url) + video_id = traverse_obj(mobj, 'id', 'id_2') + display_id = traverse_obj(mobj, 'display_id', 'display_id_2') desktop_url = re.sub(r'^(https?://(?:.+?\.)?)m\.', r'\1', url) webpage, urlh = self._download_webpage_handle(desktop_url, video_id) @@ -176,139 +276,19 @@ def _real_extract(self, url): age_limit = self._rta_search(webpage) - def get_height(s): - return int_or_none(self._search_regex( - r'^(\d+)[pP]', s, 'height', default=None)) - initials = self._parse_json( self._search_regex( (r'window\.initials\s*=\s*({.+?})\s*;\s*', r'window\.initials\s*=\s*({.+?})\s*;'), webpage, 'initials', default='{}'), video_id, fatal=False) + if initials: - video = initials['videoModel'] - title = video['title'] - formats = [] - format_urls = set() - format_sizes = {} - sources = try_get(video, lambda x: x['sources'], dict) or {} - for format_id, formats_dict in sources.items(): - if not isinstance(formats_dict, dict): - continue - download_sources = try_get(sources, lambda x: x['download'], dict) or {} - for quality, format_dict in download_sources.items(): - if not 
isinstance(format_dict, dict): - continue - format_sizes[quality] = float_or_none(format_dict.get('size')) - for quality, format_item in formats_dict.items(): - if format_id == 'download': - # Download link takes some time to be generated, - # skipping for now - continue - format_url = format_item - format_url = url_or_none(format_url) - if not format_url or format_url in format_urls: - continue - format_urls.add(format_url) - formats.append({ - 'format_id': '%s-%s' % (format_id, quality), - 'url': format_url, - 'ext': determine_ext(format_url, 'mp4'), - 'height': get_height(quality), - 'filesize': format_sizes.get(quality), - 'http_headers': { - 'Referer': urlh.geturl(), - }, - }) - xplayer_sources = try_get( - initials, lambda x: x['xplayerSettings']['sources'], dict) - if xplayer_sources: - hls_sources = xplayer_sources.get('hls') - if isinstance(hls_sources, dict): - for hls_format_key in ('url', 'fallback'): - hls_url = hls_sources.get(hls_format_key) - if not hls_url: - continue - hls_url = urljoin(url, hls_url) - if not hls_url or hls_url in format_urls: - continue - format_urls.add(hls_url) - formats.extend(self._extract_m3u8_formats( - hls_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - standard_sources = xplayer_sources.get('standard') - if isinstance(standard_sources, dict): - for format_id, formats_list in standard_sources.items(): - if not isinstance(formats_list, list): - continue - for standard_format in formats_list: - if not isinstance(standard_format, dict): - continue - for standard_format_key in ('url', 'fallback'): - standard_url = standard_format.get(standard_format_key) - if not standard_url: - continue - standard_url = urljoin(url, standard_url) - if not standard_url or standard_url in format_urls: - continue - format_urls.add(standard_url) - ext = determine_ext(standard_url, 'mp4') - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - standard_url, video_id, 'mp4', 
entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue - quality = (str_or_none(standard_format.get('quality')) - or str_or_none(standard_format.get('label')) - or '') - formats.append({ - 'format_id': '%s-%s' % (format_id, quality), - 'url': standard_url, - 'ext': ext, - 'height': get_height(quality), - 'filesize': format_sizes.get(quality), - 'http_headers': { - 'Referer': standard_url, - }, - }) - self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) - - categories_list = video.get('categories') - if isinstance(categories_list, list): - categories = [] - for c in categories_list: - if not isinstance(c, dict): - continue - c_name = c.get('name') - if isinstance(c_name, compat_str): - categories.append(c_name) - else: - categories = None - - uploader_url = url_or_none(try_get(video, lambda x: x['author']['pageURL'])) - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': video.get('description'), - 'timestamp': int_or_none(video.get('created')), - 'uploader': try_get( - video, lambda x: x['author']['name'], compat_str), - 'uploader_url': uploader_url, - 'uploader_id': uploader_url.split('/')[-1] if uploader_url else None, - 'thumbnail': video.get('thumbURL'), - 'duration': int_or_none(video.get('duration')), - 'view_count': int_or_none(video.get('views')), - 'like_count': int_or_none(try_get( - video, lambda x: x['rating']['likes'], int)), - 'dislike_count': int_or_none(try_get( - video, lambda x: x['rating']['dislikes'], int)), - 'comment_count': int_or_none(video.get('views')), - 'age_limit': age_limit if age_limit is not None else 18, - 'categories': categories, - 'formats': formats, - } + return self._extract_initials(initials, video_id, display_id, url, urlh.geturl, age_limit) + + return self._old_real_extract(webpage, video_id, display_id, age_limit) + + def _old_real_extract(self, webpage, video_id, display_id, age_limit): # Old layout fallback @@ -326,17 +306,17 @@ def 
get_height(s): r'sources\s*:\s*({.+?})\s*,?\s*\n', webpage, 'sources', default='{}'), video_id, fatal=False) - for format_id, format_url in sources.items(): - format_url = url_or_none(format_url) - if not format_url: - continue + for format_id, format_url in traverse_obj(sources, ( + T(dict.items), Ellipsis, + T(lambda kv: (kv[0], url_or_none(kv[1]))), + T(lambda kv: kv[1] and kv))): if format_url in format_urls: continue format_urls.add(format_url) formats.append({ 'format_id': format_id, 'url': format_url, - 'height': get_height(format_id), + 'height': self._get_height(format_id), }) video_url = self._search_regex( @@ -351,62 +331,49 @@ def get_height(s): self._sort_formats(formats) - # Only a few videos have an description - mobj = re.search(r'Description: ([^<]+)', webpage) - description = mobj.group(1) if mobj else None - - upload_date = unified_strdate(self._search_regex( - r'hint=["\'](\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}', - webpage, 'upload date', fatal=False)) - uploader = self._html_search_regex( r']+itemprop=["\']author[^>]+>]+>]+>([^<]+)', webpage, 'uploader', default='anonymous') - thumbnail = self._search_regex( - [r'''["']thumbUrl["']\s*:\s*(?P["'])(?P.+?)(?P=q)''', - r''']+"poster"=(?P["'])(?P.+?)(?P=q)[^>]*>'''], - webpage, 'thumbnail', fatal=False, group='thumbnail') - - duration = parse_duration(self._search_regex( - [r'<[^<]+\bitemprop=["\']duration["\'][^<]+\bcontent=["\'](.+?)["\']', - r'Runtime:\s*\s*([\d:]+)'], webpage, - 'duration', fatal=False)) - - view_count = int_or_none(self._search_regex( - r'content=["\']User(?:View|Play)s:(\d+)', - webpage, 'view count', fatal=False)) - - mobj = re.search(r'hint=[\'"](?P\d+) Likes / (?P\d+) Dislikes', webpage) - (like_count, dislike_count) = (mobj.group('likecount'), mobj.group('dislikecount')) if mobj else (None, None) - - mobj = re.search(r'Comments \((?P\d+)\)', webpage) - comment_count = mobj.group('commentcount') if mobj else 0 - - categories_html = self._search_regex( - 
r'(?s)Categories:.+?)', webpage, - 'categories', default=None) categories = [clean_html(category) for category in re.findall( - r']+>(.+?)', categories_html)] if categories_html else None + r']+>(.+?)', self._search_regex( + r'(?s)Categories:.+?)', webpage, + 'categories', default=''))] - return { + return merge_dicts({ 'id': video_id, 'display_id': display_id, 'title': title, - 'description': description, - 'upload_date': upload_date, + # Only a few videos have a description + 'description': traverse_obj( + re.search(r'Description:\s*([^<]+)', webpage), 1), + 'upload_date': unified_strdate(self._search_regex( + r'hint=["\'](\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}', + webpage, 'upload date', fatal=False)), 'uploader': uploader, - 'uploader_id': uploader.lower() if uploader else None, - 'thumbnail': thumbnail, - 'duration': duration, - 'view_count': view_count, - 'like_count': int_or_none(like_count), - 'dislike_count': int_or_none(dislike_count), - 'comment_count': int_or_none(comment_count), + 'uploader_id': (uploader or '').lower() or None, + 'thumbnail': url_or_none(self._search_regex( + (r'''["']thumbUrl["']\s*:\s*(?P["'])(?P.+?)(?P=q)''', + r''']+"poster"=(?P["'])(?P.+?)(?P=q)[^>]*>'''), + webpage, 'thumbnail', fatal=False, group='thumbnail')), + 'duration': parse_duration(self._search_regex( + (r'<[^<]+\bitemprop=["\']duration["\'][^<]+\bcontent=["\'](.+?)["\']', + r'Runtime:\s*\s*([\d:]+)'), webpage, + 'duration', fatal=False)), + 'view_count': int_or_none(self._search_regex( + r'content=["\']User(?:View|Play)s:\s*(\d+)', + webpage, 'view count', fatal=False)), + 'comment_count': traverse_obj( + re.search(r'Comments \((?P\d+)\)', webpage), + ('commentcount', T(int_or_none))), 'age_limit': age_limit, - 'categories': categories, + 'categories': categories or None, 'formats': formats, - } + }, traverse_obj( + re.search(r'hint=[\'"](?P\d+) Likes / (?P\d+) Dislikes', webpage), { + 'like_count': ('likecount', T(int_or_none)), + 'dislike_count': 
('dislikecount', T(int_or_none)), + })) class XHamsterEmbedIE(XHamsterBaseIE): @@ -420,6 +387,7 @@ class XHamsterEmbedIE(XHamsterBaseIE): 'timestamp': 1406581861, 'upload_date': '20140728', 'uploader': 'ManyakisArt', + 'uploader_id': 'manyakisart', 'duration': 5, 'age_limit': 18, } @@ -444,7 +412,7 @@ def _real_extract(self, url): vars = self._parse_json( self._search_regex(r'vars\s*:\s*({.+?})\s*,\s*\n', webpage, 'vars'), video_id) - video_url = dict_get(vars, ('downloadLink', 'homepageLink', 'commentsLink', 'shareUrl')) + video_url = traverse_obj(vars, 'downloadLink', 'homepageLink', 'commentsLink', 'shareUrl', expected_type=url_or_none) return self.url_result(video_url, 'XHamster') From d912aa011a783b0b8d2ad10f5c9bbe26d87fc86d Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 2 Oct 2023 04:35:32 +0100 Subject: [PATCH 04/11] [test] Only limit playlist test when `playlist_mincount` is the only count tested * eg not when `playlist_count` is specified * avoid `playlist_mincount` if a `lambda` test may test the count --- test/test_download.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_download.py b/test/test_download.py index e0bc8cb9542..71708788fc4 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -35,6 +35,7 @@ ExtractorError, error_to_compat_str, format_bytes, + traverse_obj, UnavailableVideoError, ) from youtube_dl.extractor import get_info_extractor @@ -122,7 +123,8 @@ def print_skipping(reason): params['outtmpl'] = tname + '_' + params['outtmpl'] if is_playlist and 'playlist' not in test_case: params.setdefault('extract_flat', 'in_playlist') - params.setdefault('playlistend', test_case.get('playlist_mincount')) + if traverse_obj(test_case, 'playlist_count', 'playlist_maxcount', default=-1) < 0: + params.setdefault('playlistend', test_case.get('playlist_mincount')) params.setdefault('skip_download', True) ydl = YoutubeDL(params, auto_init=False) From ffccff1a1f707b6c95e21ee4c3a0bab07487a467 Mon Sep 17 00:00:00 
2001 From: dirkf Date: Mon, 2 Oct 2023 02:54:15 +0100 Subject: [PATCH 05/11] [XHamster] Add extraction for Creator, Category, Search pages * re-factor existing playlist extraction - for a URL with specified page, extract that page only with `(p{n})` appended to title - otherwise follow next page continuations with `(all)` appended * add XHamsterCreatorIE for Creator/Pornstar/Celebrity pages * add XHamsterCategoryIE for Category pages * add XHamsterSearchIE for search pages with search term as base title * add XHamsterSearchKeyIE to support search with xhsearch[n]: pseudo-URL scheme --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/xhamster.py | 318 ++++++++++++++++++++++++++--- 2 files changed, 296 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d9289e5bf1a..82783b30348 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1602,6 +1602,10 @@ from .xhamster import ( XHamsterIE, XHamsterEmbedIE, + XHamsterCategoryIE, + XHamsterCreatorIE, + XHamsterSearchIE, + XHamsterSearchKeyIE, XHamsterUserIE, ) from .xiami import ( diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index f84d55118b7..65667b4c005 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -4,7 +4,9 @@ import itertools import re -from .common import InfoExtractor +from math import isinf + +from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( compat_kwargs, compat_urlparse, @@ -417,15 +419,95 @@ def _real_extract(self, url): return self.url_result(video_url, 'XHamster') -class XHamsterUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?%s/users/(?P[^/?#&]+)' % XHamsterIE._DOMAINS +class XHamsterPlaylistIE(XHamsterBaseIE): + _NEXT_PAGE_RE = r'(]+\bdata-page\s*=\s*["\']next[^>]+>)' + + def _page_url(self, user_id, page_num, url=None): + return self._PAGE_URL_TPL % (user_id, page_num) + + 
def _extract_entries(self, page, user_id): + for video_tag_match in re.finditer( + r']+class=["\'].*?\bvideo-thumb__image-container[^>]+>', + page): + video_url = traverse_obj(video_tag_match, ( + 0, T(extract_attributes), 'href', T(url_or_none))) + if not video_url or not XHamsterIE.suitable(video_url): + continue + video_id = XHamsterIE._match_id(video_url) + yield self.url_result( + video_url, ie=XHamsterIE.ie_key(), video_id=video_id) + + def _next_page_url(self, page, user_id, page_num): + return traverse_obj( + self._search_regex(self._NEXT_PAGE_RE, page, 'next page', default=None), + (T(extract_attributes), 'href', T(url_or_none))) + + def _entries(self, user_id, page_num=None, page=None, url=None): + page_1 = 1 if page_num is None else page_num + next_page_url = self._page_url(user_id, page_1, url) + for pagenum in itertools.count(page_1): + if not page: + page = self._download_webpage( + next_page_url, user_id, 'Downloading page' + ((' %d' % pagenum) if pagenum > 1 else ''), + fatal=False) + if not page: + break + + for from_ in self._extract_entries(page, user_id): + yield from_ + + if page_num is not None: + break + next_page_url = self._next_page_url(page, user_id, page_num) + if not next_page_url: + break + page = None + + def _fancy_page_url(self, user_id, page_num, url): + sub = self._match_valid_url(url).group('sub') + n_url = self._PAGE_URL_TPL % ( + join_nonempty(user_id, sub, delim='/'), page_num) + return compat_urlparse.urljoin(n_url, url) + + def _fancy_get_title(self, user_id, page_num, url): + sub = self._match_valid_url(url).group('sub') + sub = (sub or '').split('/') + sub.extend((compat_urlparse.urlsplit(url).query or '').split('&')) + sub.append('all' if page_num is None else ('p%d' % page_num)) + return '%s (%s)' % (user_id, join_nonempty(*sub, delim=',')) + + @staticmethod + def _get_title(user_id, page_num, url=None): + return '%s (%s)' % (user_id, 'all' if page_num is None else ('p%d' % page_num)) + + def _real_extract(self, url): + 
mobj = self._match_valid_url(url) + user_id = mobj.group('id') + page_num = int_or_none(mobj.groupdict().get('pnum')) + return self.playlist_result( + self._entries(user_id, page_num, url=url), user_id, + self._get_title(user_id, page_num, url=url)) + + +class XHamsterUserIE(XHamsterPlaylistIE): + _VALID_URL = r'https?://(?:.+?\.)?%s/users/(?P[^/?#&]+)(?:/videos/(?P\d+))?' % XHamsterIE._DOMAINS + _PAGE_URL_TPL = 'https://xhamster.com/users/%s/videos/%s' _TESTS = [{ # Paginated user profile 'url': 'https://xhamster.com/users/netvideogirls/videos', 'info_dict': { 'id': 'netvideogirls', + 'title': 'netvideogirls (all)', }, 'playlist_mincount': 267, + }, { + # Page from paginated user profile + 'url': 'https://xhamster.com/users/netvideogirls/videos/2', + 'info_dict': { + 'id': 'netvideogirls', + 'title': 'netvideogirls (p2)', + }, + 'playlist_count': 30, }, { # Non-paginated user profile 'url': 'https://xhamster.com/users/firatkaan/videos', @@ -440,29 +522,213 @@ class XHamsterUserIE(InfoExtractor): 'only_matching': True, }] - def _entries(self, user_id): - next_page_url = 'https://xhamster.com/users/%s/videos/1' % user_id - for pagenum in itertools.count(1): - page = self._download_webpage( - next_page_url, user_id, 'Downloading page %s' % pagenum) - for video_tag in re.findall( - r'(]+class=["\'].*?\bvideo-thumb__image-container[^>]+>)', - page): - video = extract_attributes(video_tag) - video_url = url_or_none(video.get('href')) - if not video_url or not XHamsterIE.suitable(video_url): - continue - video_id = XHamsterIE._match_id(video_url) - yield self.url_result( - video_url, ie=XHamsterIE.ie_key(), video_id=video_id) - mobj = re.search(r']+data-page=["\']next[^>]+>', page) - if not mobj: - break - next_page = extract_attributes(mobj.group(0)) - next_page_url = url_or_none(next_page.get('href')) - if not next_page_url: - break + +class XHamsterCreatorIE(XHamsterPlaylistIE): + # `pornstars`, `celebrities` and `creators` share the same namespace + _VALID_URL = 
r'''(?x) + https?://(?:.+?\.)?%s + /(?:(?:gay|shemale)/)?(?:creators|pornstars|celebrities) + /(?P[^/?#]+) + (?:(?P(?:/(?:hd|4k|newest|full-length|exclusive))+))? + (?:/(?P\d+))?(?:[/?#]|$) + ''' % XHamsterIE._DOMAINS + _PAGE_URL_TPL = 'https://xhamster.com/creators/%s/%s' + _TESTS = [{ + # Paginated creator profile + 'url': 'https://xhamster.com/creators/mini-elfie', + 'info_dict': { + 'id': 'mini-elfie', + 'title': 'mini-elfie (all)', + }, + 'playlist_mincount': 70, + }, { + # Paginated pornstar profile + 'url': 'https://xhamster.com/pornstars/mariska-x', + 'info_dict': { + 'id': 'mariska-x', + 'title': 'mariska-x (all)', + }, + 'playlist_mincount': 163, + }, { + # creator profile filtered by path + 'url': 'https://xhamster.com/creators/mini-elfie/4k', + 'info_dict': { + 'id': 'mini-elfie', + 'title': 'mini-elfie (4k,all)', + }, + 'playlist_mincount': 5, + 'playlist_maxcount': 30, + }, { + # creator profile filtered by query + 'url': 'https://xhamster.com/creators/mini-elfie/?category=pov', + 'info_dict': { + 'id': 'mini-elfie', + 'title': 'mini-elfie (category=pov,all)', + }, + 'playlist_mincount': 8, + 'playlist_maxcount': 30, + }] + + def _page_url(self, user_id, page_num, url): + return self._fancy_page_url(user_id, page_num, url) + + def _get_title(self, user_id, page_num, url): + return self._fancy_get_title(user_id, page_num, url) + + +class XHamsterCategoryIE(XHamsterPlaylistIE): + # `tags` and `categories` share the same namespace + _VALID_URL = r'''(?x) + https?://(?:.+?\.)?%s + (?:(?Pgay|shemale)/)?(?:/categories|/tags|(?=/hd)) + /(?P[^/?#]+) + (?P(?:/(?:hd|4k|producer|creator|best(?:/(?:weekly|monthly|year-\d{4}))?))+)? 
+ (?:/(?P\d+))?(?:[/?#]|$) + ''' % XHamsterIE._DOMAINS + _PAGE_URL_TPL = 'https://xhamster.com/categories/%s/%s' + _NEXT_PAGE_RE = r'(]+\bclass\s*=\s*("|\')(?:[\w-]+\s+)*?prev-next-list-link--next(?:\s+[\w-]+)*\2[^>]+>)' + _TESTS = [{ + # Paginated category/tag + 'url': 'https://xhamster.com/tags/hawaiian', + 'info_dict': { + 'id': 'hawaiian', + 'title': 'hawaiian (all)', + }, + 'playlist_mincount': 250, + }, { + # Single page category/tag + 'url': 'https://xhamster.com/categories/aruban', + 'info_dict': { + 'id': 'aruban', + 'title': 'aruban (all)', + }, + 'playlist_mincount': 5, + 'playlist_maxcount': 30, + }, { + # category/tag filtered by path + 'url': 'https://xhamster.com/categories/hawaiian/4k', + 'info_dict': { + 'id': 'hawaiian', + 'title': 'hawaiian (4k,all)', + }, + 'playlist_mincount': 1, + 'playlist_maxcount': 20, + }, { + # category/tag filtered by query + 'url': 'https://xhamster.com/tags/hawaiian?fps=60', + 'info_dict': { + 'id': 'hawaiian', + 'title': 'hawaiian (fps=60,all)', + }, + 'playlist_mincount': 1, + 'playlist_maxcount': 20, + }] + + def _page_url(self, user_id, page_num, url): + queer, sub = self._match_valid_url(url).group('queer', 'sub') + n_url = self._PAGE_URL_TPL % ( + join_nonempty(queer, user_id, sub, delim='/'), page_num) + return compat_urlparse.urljoin(n_url, url) + + def _get_title(self, user_id, page_num, url): + queer, sub = self._match_valid_url(url).group('queer', 'sub') + queer = [] if queer is None else [queer] + sub = queer + (sub or '').split('/') + sub.extend((compat_urlparse.urlsplit(url).query or '').split('&')) + sub.append('all' if page_num is None else ('p%d' % page_num)) + return '%s (%s)' % (user_id, join_nonempty(*sub, delim=',')) + + +class XHamsterSearchIE(XHamsterPlaylistIE): + _VALID_URL = r'''(?x) + https?://(?:.+?\.)?%s + /search/(?P[^/?#]+) + ''' % XHamsterIE._DOMAINS + _TESTS = [{ + # Single page result + 'url': 'https://xhamster.com/search/latvia', + 'info_dict': { + 'id': 'latvia', + 'title': 'latvia 
(all)', + }, + 'playlist_mincount': 10, + 'playlist_maxcount': 30, + }, { + # Paginated result + 'url': 'https://xhamster.com/search/latvia+estonia+moldova+lithuania', + 'info_dict': { + 'id': 'latvia+estonia+moldova+lithuania', + 'title': 'latvia estonia moldova lithuania (all)', + }, + 'playlist_mincount': 63, + }, { + # Single page of paginated result + 'url': 'https://xhamster.com/search/latvia+estonia+moldova+lithuania?page=2', + 'info_dict': { + 'id': 'latvia+estonia+moldova+lithuania', + 'title': 'latvia estonia moldova lithuania (p2)', + }, + 'playlist_count': 47, + }] + + @staticmethod + def _page_url(user_id, page_num, url): + return url + + def _get_title(self, user_id, page_num, url=None): + return super(XHamsterSearchIE, self)._get_title( + user_id.replace('+', ' '), page_num, url) def _real_extract(self, url): user_id = self._match_id(url) - return self.playlist_result(self._entries(user_id), user_id) + page_num = traverse_obj(url, ( + T(parse_qs), 'page', -1, T(int_or_none))) + return self.playlist_result( + self._entries(user_id, page_num, url=url), user_id, + self._get_title(user_id, page_num)) + + +class XHamsterSearchKeyIE(SearchInfoExtractor, XHamsterSearchIE): + _SEARCH_KEY = 'xhsearch' + _MAX_RESULTS = float('inf') + _TESTS = [{ + # Single page result + 'url': 'xhsearchall:latvia', + 'info_dict': { + 'id': 'latvia', + 'title': 'latvia (all)', + }, + 'playlist_mincount': 10, + 'playlist_maxcount': 30, + }, { + # Paginated result + 'url': 'xhsearchall:latvia estonia moldova lithuania', + 'info_dict': { + 'id': 'latvia+estonia+moldova+lithuania', + 'title': 'latvia estonia moldova lithuania (all)', + }, + 'playlist_mincount': 63, + }, { + # Subset of paginated result + 'url': 'xhsearch50:latvia estonia moldova lithuania', + 'info_dict': { + 'id': 'latvia+estonia+moldova+lithuania', + 'title': 'latvia estonia moldova lithuania (first 50)', + }, + 'playlist_count': 50, + }] + + def _get_n_results(self, query, n): + """Get a specified number of 
results for a query""" + + result = XHamsterSearchIE._real_extract( + self, 'https://xhamster.com/search/' + query.replace(' ', '+')) + + if not isinf(n): + # with the secret knowledge that `result['entries'] is a + # generator, it can be sliced efficiently + result['entries'] = itertools.islice(result['entries'], n) + if result.get('title') is not None: + result['title'] = result['title'].replace('(all)', '(first %d)' % n) + + return result From 3a31e52d278e20c2cd8297c6b68879bd51d69ea3 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 4 Oct 2023 00:59:11 +0100 Subject: [PATCH 06/11] [test] pl_counts --- test/test_download.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 71708788fc4..a01fc7f2039 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -123,8 +123,10 @@ def print_skipping(reason): params['outtmpl'] = tname + '_' + params['outtmpl'] if is_playlist and 'playlist' not in test_case: params.setdefault('extract_flat', 'in_playlist') - if traverse_obj(test_case, 'playlist_count', 'playlist_maxcount', default=-1) < 0: - params.setdefault('playlistend', test_case.get('playlist_mincount')) + # only process enough items for specified tests + pl_counts = traverse_obj(test_case, (None, ('playlist_count', 'playlist_mincount', 'playlist_maxcount'))) + if pl_counts: + params.setdefault('playlistend', max(pl_counts) + 1) params.setdefault('skip_download', True) ydl = YoutubeDL(params, auto_init=False) From e6c95bd19280b136e29f4d8a6dd8db24d2c54a17 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 4 Oct 2023 01:07:50 +0100 Subject: [PATCH 07/11] [utils] Add `classproperty()` decorator from yt-dlp --- youtube_dl/utils.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 443d2609c96..ddec962f4c8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -6510,3 +6510,33 @@ def join_nonempty(*values, 
**kwargs): if from_dict is not None: values = (traverse_obj(from_dict, variadic(v)) for v in values) return delim.join(map(compat_str, filter(None, values))) + + +# from yt-dlp +class classproperty(object): + """property access for class methods with optional caching""" + def __new__(cls, *args, **kwargs): + if 'func' in kwargs: + func = kwargs.pop('func') + elif len(args) > 0: + func = args[0] + args = args[1:] + else: + func = None + if not func: + return functools.partial(cls, *args, **kwargs) + return super(classproperty, cls).__new__(cls) + + def __init__(self, func, **kwargs): + # kw-only arg + cache = kwargs.get('cache', False) + functools.update_wrapper(self, func) + self.func = func + self._cache = {} if cache else None + + def __get__(self, n, cls): + if self._cache is None: + return self.func(cls) + elif cls not in self._cache: + self._cache[cls] = self.func(cls) + return self._cache[cls] From d0762cf36a395af1b39b18b70da07c1727d3812a Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 4 Oct 2023 01:09:29 +0100 Subject: [PATCH 08/11] [utils] Add `classpropinit()` decorator for easier use of inherited class vars --- youtube_dl/utils.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ddec962f4c8..fb2fa41a445 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -6540,3 +6540,33 @@ def __get__(self, n, cls): elif cls not in self._cache: self._cache[cls] = self.func(cls) return self._cache[cls] + + +class classpropinit(classproperty): + """ A Python fubar: parent class vars are not in scope when the + `class suite` is evaluated, so disallowing `childvar = fn(parentvar)`. + Instead, the parent class has to be mentioned redundantly and + unmaintainably, since the current class isn't yet bound. + This decorator evaluates a class method and assigns its result + in place of the method. 
+ + class child(parent): + # before + childvar = fn(parent.parentvar) + # now + @classpropinit + def childvar(cls): + return fn(cls.parentvar) + # or + childvar = classpropinit(lambda cls: fn(cls.parentvar)) + """ + + def __init__(self, func): + functools.update_wrapper(self, func) + self.name = func.__name__ + self.func = func + + def __get__(self, _, cls): + val = self.func(cls) + setattr(cls, self.name, val) + return val From 44a30c6d3a0358596347e800f353c272c75922d3 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 4 Oct 2023 01:33:39 +0100 Subject: [PATCH 09/11] [XHamster] Move domain list to base class and introduce classpropinit --- youtube_dl/extractor/xhamster.py | 90 ++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 38 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 65667b4c005..8a37a5d2eff 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -12,6 +12,7 @@ compat_urlparse, ) from ..utils import ( + classpropinit, clean_html, determine_ext, extract_attributes, @@ -32,6 +33,18 @@ class XHamsterBaseIE(InfoExtractor): + # base domains that don't redirect to xhamster.com (not xhday\d\.com, eg) + _DOMAINS = '(?:%s)' % '|'.join(( + r'xhamster\d*\.(?:com|desi)', + r'xhamster\.one', + r'xhms\.pro', + r'xh(?:open|access|victory|big|channel)\.com', + r'(?:full|mega)xh\.com', + r'xh(?:vid|official|planet)\d*\.com', + # requires Tor + r'xhamster[a-z2-7]+\.onion', + )) + def _download_webpage_handle(self, url, video_id, *args, **kwargs): # note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None) # default UA to 'Mozilla' (only) to avoid interstitial page @@ -53,25 +66,16 @@ def _download_webpage_handle(self, url, video_id, *args, **kwargs): class XHamsterIE(XHamsterBaseIE): - # base domains that don't redirect to xhamster.com (not xhday\d\.com, eg) - _DOMAINS = '(?:%s)' % '|'.join(( - r'xhamster\d*\.(?:com|desi)', - 
r'xhamster\.one', - r'xhms\.pro', - r'xh(?:open|access|victory|big|channel)\.com', - r'(?:full|mega)xh\.com', - r'xh(?:vid|official|planet)\d*\.com', - # requires Tor - r'xhamster[a-z2-7]+\.onion', - )) - _VALID_URL = r'''(?x) - https?:// - (?:.+?\.)?%s/ - (?: - movies/(?P[\dA-Za-z]+)/(?P[^/]*)\.html| - videos/(?P[^/]*)-(?P[\dA-Za-z]+) - ) - ''' % _DOMAINS + _VALID_URL = classpropinit( + lambda cls: + r'''(?x) + https?:// + (?:.+?\.)?%s/ + (?: + movies/(?P[\dA-Za-z]+)/(?P[^/]*)\.html| + videos/(?P[^/]*)-(?P[\dA-Za-z]+) + ) + ''' % cls._DOMAINS) _TESTS = [{ 'url': 'https://xhamster.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445', 'md5': '34e1ab926db5dc2750fed9e1f34304bb', @@ -379,7 +383,9 @@ def _old_real_extract(self, webpage, video_id, display_id, age_limit): class XHamsterEmbedIE(XHamsterBaseIE): - _VALID_URL = r'https?://(?:.+?\.)?%s/xembed\.php\?video=(?P\d+)' % XHamsterIE._DOMAINS + _VALID_URL = classpropinit( + lambda cls: + r'https?://(?:.+?\.)?%s/xembed\.php\?video=(?P\d+)' % cls._DOMAINS) _TEST = { 'url': 'http://xhamster.com/xembed.php?video=3328539', 'info_dict': { @@ -421,6 +427,12 @@ def _real_extract(self, url): class XHamsterPlaylistIE(XHamsterBaseIE): _NEXT_PAGE_RE = r'(]+\bdata-page\s*=\s*["\']next[^>]+>)' + _VALID_URL_TPL = r'''(?x) + https?://(?:.+?\.)?%s + /%s/(?P[^/?#]+) + (?:(?P(?:/%s)+))? + (?:/(?P\d+))?(?:[/?#]|$) + ''' def _page_url(self, user_id, page_num, url=None): return self._PAGE_URL_TPL % (user_id, page_num) @@ -525,13 +537,13 @@ class XHamsterUserIE(XHamsterPlaylistIE): class XHamsterCreatorIE(XHamsterPlaylistIE): # `pornstars`, `celebrities` and `creators` share the same namespace - _VALID_URL = r'''(?x) - https?://(?:.+?\.)?%s - /(?:(?:gay|shemale)/)?(?:creators|pornstars|celebrities) - /(?P[^/?#]+) - (?:(?P(?:/(?:hd|4k|newest|full-length|exclusive))+))? 
- (?:/(?P\d+))?(?:[/?#]|$) - ''' % XHamsterIE._DOMAINS + _VALID_URL = classpropinit( + lambda cls: + cls._VALID_URL_TPL % ( + cls._DOMAINS, + '(?:(?:gay|shemale)/)?(?:creators|pornstars|celebrities)', + r'(?:hd|4k|newest|full-length|exclusive|best(?:/(?:weekly|monthly|year-\d{4}))?)', + )) _PAGE_URL_TPL = 'https://xhamster.com/creators/%s/%s' _TESTS = [{ # Paginated creator profile @@ -578,13 +590,13 @@ def _get_title(self, user_id, page_num, url): class XHamsterCategoryIE(XHamsterPlaylistIE): # `tags` and `categories` share the same namespace - _VALID_URL = r'''(?x) - https?://(?:.+?\.)?%s - (?:(?Pgay|shemale)/)?(?:/categories|/tags|(?=/hd)) - /(?P[^/?#]+) - (?P(?:/(?:hd|4k|producer|creator|best(?:/(?:weekly|monthly|year-\d{4}))?))+)? - (?:/(?P\d+))?(?:[/?#]|$) - ''' % XHamsterIE._DOMAINS + _VALID_URL = classpropinit( + lambda cls: + cls._VALID_URL_TPL % ( + cls._DOMAINS, + '(?:(?Pgay|shemale)/)?(?:categories|tags|(?=hd))', + r'(?:hd|4k|producer|creator|best(?:/(?:weekly|monthly|year-\d{4}))?)', + )) _PAGE_URL_TPL = 'https://xhamster.com/categories/%s/%s' _NEXT_PAGE_RE = r'(]+\bclass\s*=\s*("|\')(?:[\w-]+\s+)*?prev-next-list-link--next(?:\s+[\w-]+)*\2[^>]+>)' _TESTS = [{ @@ -640,10 +652,12 @@ def _get_title(self, user_id, page_num, url): class XHamsterSearchIE(XHamsterPlaylistIE): - _VALID_URL = r'''(?x) - https?://(?:.+?\.)?%s - /search/(?P[^/?#]+) - ''' % XHamsterIE._DOMAINS + _VALID_URL = classpropinit( + lambda cls: + r'''(?x) + https?://(?:.+?\.)?%s + /search/(?P[^/?#]+) + ''' % cls._DOMAINS) _TESTS = [{ # Single page result 'url': 'https://xhamster.com/search/latvia', From 71aae1d7953b7d9301dc5e855c8aeb95410c60e8 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 4 Oct 2023 01:40:04 +0100 Subject: [PATCH 10/11] [XHamster] Add extraction of user's `favorites` --- youtube_dl/extractor/xhamster.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 
8a37a5d2eff..cff11b7a965 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -502,7 +502,9 @@ def _real_extract(self, url): class XHamsterUserIE(XHamsterPlaylistIE): - _VALID_URL = r'https?://(?:.+?\.)?%s/users/(?P[^/?#&]+)(?:/videos/(?P\d+))?' % XHamsterIE._DOMAINS + _VALID_URL = classpropinit( + lambda cls: + r'https?://(?:.+?\.)?%s/users/(?P[^/?#&]+)(?P/favorites)?(?:/videos/(?P\d+))?' % cls._DOMAINS) _PAGE_URL_TPL = 'https://xhamster.com/users/%s/videos/%s' _TESTS = [{ # Paginated user profile @@ -525,10 +527,27 @@ class XHamsterUserIE(XHamsterPlaylistIE): 'url': 'https://xhamster.com/users/firatkaan/videos', 'info_dict': { 'id': 'firatkaan', + 'title': 'firatkaan (all)', }, 'playlist_mincount': 1, }, { - # the below doesn't match but is redirected via generic + # User with `favorites` + 'url': 'https://xhamster.com/users/cubafidel/videos/', + 'info_dict': { + 'id': 'cubafidel', + 'title': 'cubafidel (all)', + }, + 'playlist_maxcount': 300, + }, { + # Faves of user with `favorites` + 'url': 'https://xhamster.com/users/cubafidel/favorites/videos/', + 'info_dict': { + 'id': 'cubafidel', + 'title': 'cubafidel (favorites,all)', + }, + 'playlist_mincount': 400, + }, { + # below URL doesn't match but is redirected via generic # 'url': 'https://xhday.com/users/mobhunter', 'url': 'https://xhvid.com/users/pelushe21', 'only_matching': True, From b2b622a9b52cae9ae856755068ad99308e624e11 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 4 Oct 2023 01:41:35 +0100 Subject: [PATCH 11/11] [XHamster] Add channel extraction --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/xhamster.py | 129 +++++++++++++++++++---------- 2 files changed, 85 insertions(+), 45 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 82783b30348..c49cd3b384f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1603,6 +1603,7 @@ XHamsterIE, XHamsterEmbedIE, 
XHamsterCategoryIE, + XHamsterChannelIE, XHamsterCreatorIE, XHamsterSearchIE, XHamsterSearchKeyIE, diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index cff11b7a965..917a8c490a5 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -23,6 +23,7 @@ merge_dicts, parse_duration, parse_qs, + remove_start, T, traverse_obj, txt_or_none, @@ -434,8 +435,12 @@ class XHamsterPlaylistIE(XHamsterBaseIE): (?:/(?P\d+))?(?:[/?#]|$) ''' - def _page_url(self, user_id, page_num, url=None): - return self._PAGE_URL_TPL % (user_id, page_num) + def _page_url(self, user_id, subs, page_num, url): + n_url = self._PAGE_URL_TPL % ( + join_nonempty(user_id, *subs, delim='/'), page_num) + n_url = compat_urlparse.urlsplit(n_url) + url = compat_urlparse.urlsplit(url) + return compat_urlparse.urlunsplit(n_url[:3] + url[3:]) def _extract_entries(self, page, user_id): for video_tag_match in re.finditer( @@ -454,9 +459,9 @@ def _next_page_url(self, page, user_id, page_num): self._search_regex(self._NEXT_PAGE_RE, page, 'next page', default=None), (T(extract_attributes), 'href', T(url_or_none))) - def _entries(self, user_id, page_num=None, page=None, url=None): + def _entries(self, user_id, subs, page_num=None, page=None, url=None): page_1 = 1 if page_num is None else page_num - next_page_url = self._page_url(user_id, page_1, url) + next_page_url = self._page_url(user_id, subs, page_1, url) for pagenum in itertools.count(page_1): if not page: page = self._download_webpage( @@ -475,30 +480,22 @@ def _entries(self, user_id, page_num=None, page=None, url=None): break page = None - def _fancy_page_url(self, user_id, page_num, url): - sub = self._match_valid_url(url).group('sub') - n_url = self._PAGE_URL_TPL % ( - join_nonempty(user_id, sub, delim='/'), page_num) - return compat_urlparse.urljoin(n_url, url) - - def _fancy_get_title(self, user_id, page_num, url): - sub = self._match_valid_url(url).group('sub') - sub = (sub or '').split('/') - 
sub.extend((compat_urlparse.urlsplit(url).query or '').split('&')) - sub.append('all' if page_num is None else ('p%d' % page_num)) - return '%s (%s)' % (user_id, join_nonempty(*sub, delim=',')) - @staticmethod - def _get_title(user_id, page_num, url=None): - return '%s (%s)' % (user_id, 'all' if page_num is None else ('p%d' % page_num)) + def _get_title(user_id, subs, page_num, url): + subs = subs[:] + if url: + subs.extend((compat_urlparse.urlsplit(url).query or '').split('&')) + subs.append('all' if page_num is None else ('p%d' % page_num)) + return '%s (%s)' % (user_id, join_nonempty(*subs, delim=',')) def _real_extract(self, url): - mobj = self._match_valid_url(url) - user_id = mobj.group('id') - page_num = int_or_none(mobj.groupdict().get('pnum')) + mobj = self._match_valid_url(url).groupdict() + user_id = mobj['id'] + page_num = int_or_none(mobj.get('pnum')) + subs = remove_start(mobj.get('sub') or '', '/').split('/') return self.playlist_result( - self._entries(user_id, page_num, url=url), user_id, - self._get_title(user_id, page_num, url=url)) + self._entries(user_id, subs, page_num, url=url), user_id, + self._get_title(user_id, subs, page_num, url=url)) class XHamsterUserIE(XHamsterPlaylistIE): @@ -600,14 +597,57 @@ class XHamsterCreatorIE(XHamsterPlaylistIE): 'playlist_maxcount': 30, }] - def _page_url(self, user_id, page_num, url): - return self._fancy_page_url(user_id, page_num, url) - def _get_title(self, user_id, page_num, url): - return self._fancy_get_title(user_id, page_num, url) +class XHamsterChannelBaseIE(XHamsterPlaylistIE): + _NEXT_PAGE_RE = r'(]+\bclass\s*=\s*("|\')(?:[\w-]+\s+)*?prev-next-list-link--next(?:\s+[\w-]+)*\2[^>]+>)' -class XHamsterCategoryIE(XHamsterPlaylistIE): +class XHamsterChannelIE(XHamsterChannelBaseIE): + _VALID_URL = classpropinit( + lambda cls: + cls._VALID_URL_TPL % ( + cls._DOMAINS, + '(?:(?:gay|shemale)/)?channels', + r'(?:hd|4k|newest|full-length|best(?:/(?:weekly|monthly|year-\d{4}))?)', + )) + _PAGE_URL_TPL = 
'https://xhamster.com/channels/%s/%s' + _TESTS = [{ + # Paginated channel + 'url': 'https://xhamster.com/channels/freeuse-fantasy', + 'info_dict': { + 'id': 'freeuse-fantasy', + 'title': 'freeuse-fantasy (all)', + }, + 'playlist_mincount': 90, + }, { + # Non-paginated channel (for now?) + 'url': 'https://xhamster.com/channels/oopsie', + 'info_dict': { + 'id': 'oopsie', + 'title': 'oopsie (all)', + }, + 'playlist_mincount': 30, + 'playlist_maxcount': 48, + }, { + # Channel filtered by path + 'url': 'https://xhamster.com/channels/freeuse-fantasy/best/year-2022', + 'info_dict': { + 'id': 'freeuse-fantasy', + 'title': 'freeuse-fantasy (best,year-2022,all)', + }, + 'playlist_count': 30, + }, { + # Channel filtered by query + 'url': 'https://xhamster.com/channels/freeuse-fantasy?min-duration=40', + 'info_dict': { + 'id': 'freeuse-fantasy', + 'title': 'freeuse-fantasy (min-duration=40,all)', + }, + 'playlist_maxcount': 10, + }] + + +class XHamsterCategoryIE(XHamsterChannelBaseIE): # `tags` and `categories` share the same namespace _VALID_URL = classpropinit( lambda cls: @@ -617,7 +657,6 @@ class XHamsterCategoryIE(XHamsterPlaylistIE): r'(?:hd|4k|producer|creator|best(?:/(?:weekly|monthly|year-\d{4}))?)', )) _PAGE_URL_TPL = 'https://xhamster.com/categories/%s/%s' - _NEXT_PAGE_RE = r'(]+\bclass\s*=\s*("|\')(?:[\w-]+\s+)*?prev-next-list-link--next(?:\s+[\w-]+)*\2[^>]+>)' _TESTS = [{ # Paginated category/tag 'url': 'https://xhamster.com/tags/hawaiian', @@ -655,19 +694,19 @@ class XHamsterCategoryIE(XHamsterPlaylistIE): 'playlist_maxcount': 20, }] - def _page_url(self, user_id, page_num, url): - queer, sub = self._match_valid_url(url).group('queer', 'sub') + def _page_url(self, user_id, subs, page_num, url): + queer = self._match_valid_url(url).group('queer') n_url = self._PAGE_URL_TPL % ( - join_nonempty(queer, user_id, sub, delim='/'), page_num) + join_nonempty(queer, user_id, *subs, delim='/'), page_num) return compat_urlparse.urljoin(n_url, url) - def _get_title(self, 
user_id, page_num, url): - queer, sub = self._match_valid_url(url).group('queer', 'sub') - queer = [] if queer is None else [queer] - sub = queer + (sub or '').split('/') - sub.extend((compat_urlparse.urlsplit(url).query or '').split('&')) - sub.append('all' if page_num is None else ('p%d' % page_num)) - return '%s (%s)' % (user_id, join_nonempty(*sub, delim=',')) + def _get_title(self, user_id, subs, page_num, url): + queer = self._match_valid_url(url).group('queer') + if queer: + subs = [queer] + subs + subs.extend((compat_urlparse.urlsplit(url).query or '').split('&')) + subs.append('all' if page_num is None else ('p%d' % page_num)) + return '%s (%s)' % (user_id, join_nonempty(*subs, delim=',')) class XHamsterSearchIE(XHamsterPlaylistIE): @@ -705,20 +744,20 @@ class XHamsterSearchIE(XHamsterPlaylistIE): }] @staticmethod - def _page_url(user_id, page_num, url): + def _page_url(user_id, subs, page_num, url): return url - def _get_title(self, user_id, page_num, url=None): + def _get_title(self, user_id, subs, page_num, url=None): return super(XHamsterSearchIE, self)._get_title( - user_id.replace('+', ' '), page_num, url) + user_id.replace('+', ' '), [], page_num, url) def _real_extract(self, url): user_id = self._match_id(url) page_num = traverse_obj(url, ( T(parse_qs), 'page', -1, T(int_or_none))) return self.playlist_result( - self._entries(user_id, page_num, url=url), user_id, - self._get_title(user_id, page_num)) + self._entries(user_id, None, page_num, url=url), user_id, + self._get_title(user_id, None, page_num)) class XHamsterSearchKeyIE(SearchInfoExtractor, XHamsterSearchIE):