Update to ytdl-commit-195f22f6

[generic] Improve KVS (etc) extraction
195f22f679

Closes #3716
Authored by: Grub4k, pukkandan
This commit is contained in:
Simon Sawicki 2023-01-02 14:45:36 +01:00 committed by GitHub
parent 8300774c4a
commit 32a84bcf4e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 529 additions and 167 deletions

View file

@ -1872,6 +1872,11 @@ from .theweatherchannel import TheWeatherChannelIE
from .thisamericanlife import ThisAmericanLifeIE from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE from .thisav import ThisAVIE
from .thisoldhouse import ThisOldHouseIE from .thisoldhouse import ThisOldHouseIE
from .thisvid import (
ThisVidIE,
ThisVidMemberIE,
ThisVidPlaylistIE,
)
from .threespeak import ( from .threespeak import (
ThreeSpeakIE, ThreeSpeakIE,
ThreeSpeakUserIE, ThreeSpeakUserIE,

View file

@ -1396,10 +1396,16 @@ class InfoExtractor:
# And then there are the jokers who advertise that they use RTA, but actually don't. # And then there are the jokers who advertise that they use RTA, but actually don't.
AGE_LIMIT_MARKERS = [ AGE_LIMIT_MARKERS = [
r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
r'>[^<]*you acknowledge you are at least (\d+) years old',
r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
] ]
if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
return 18 age_limit = 0
return 0 for marker in AGE_LIMIT_MARKERS:
mobj = re.search(marker, html)
if mobj:
age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
return age_limit
def _media_rating_search(self, html): def _media_rating_search(self, html):
# See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/ # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
@ -3216,7 +3222,7 @@ class InfoExtractor:
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
mobj = re.search( mobj = re.search(
r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)', r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
webpage) webpage)
if mobj: if mobj:
try: try:
@ -3237,19 +3243,20 @@ class InfoExtractor:
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
# JWPlayer backward compatibility: flattened playlists
# https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
if 'playlist' not in jwplayer_data:
jwplayer_data = {'playlist': [jwplayer_data]}
entries = [] entries = []
if not isinstance(jwplayer_data, dict):
return entries
# JWPlayer backward compatibility: single playlist item playlist_items = jwplayer_data.get('playlist')
# JWPlayer backward compatibility: single playlist item/flattened playlists
# https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
if not isinstance(jwplayer_data['playlist'], list): # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
jwplayer_data['playlist'] = [jwplayer_data['playlist']] if not isinstance(playlist_items, list):
playlist_items = (playlist_items or jwplayer_data, )
for video_data in jwplayer_data['playlist']: for video_data in playlist_items:
if not isinstance(video_data, dict):
continue
# JWPlayer backward compatibility: flattened sources # JWPlayer backward compatibility: flattened sources
# https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
if 'sources' not in video_data: if 'sources' not in video_data:
@ -3287,6 +3294,13 @@ class InfoExtractor:
'timestamp': int_or_none(video_data.get('pubdate')), 'timestamp': int_or_none(video_data.get('pubdate')),
'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
'subtitles': subtitles, 'subtitles': subtitles,
'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ...
'genre': clean_html(video_data.get('genre')),
'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
'season_number': int_or_none(video_data.get('season')),
'episode_number': int_or_none(video_data.get('episode')),
'release_year': int_or_none(video_data.get('releasedate')),
'age_limit': int_or_none(video_data.get('age_restriction')),
} }
# https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']): if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
@ -3304,7 +3318,7 @@ class InfoExtractor:
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
urls = [] urls = set()
formats = [] formats = []
for source in jwplayer_sources_data: for source in jwplayer_sources_data:
if not isinstance(source, dict): if not isinstance(source, dict):
@ -3313,14 +3327,14 @@ class InfoExtractor:
base_url, self._proto_relative_url(source.get('file'))) base_url, self._proto_relative_url(source.get('file')))
if not source_url or source_url in urls: if not source_url or source_url in urls:
continue continue
urls.append(source_url) urls.add(source_url)
source_type = source.get('type') or '' source_type = source.get('type') or ''
ext = mimetype2ext(source_type) or determine_ext(source_url) ext = mimetype2ext(source_type) or determine_ext(source_url)
if source_type == 'hls' or ext == 'm3u8': if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
source_url, video_id, 'mp4', entry_protocol='m3u8_native', source_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id=m3u8_id, fatal=False)) m3u8_id=m3u8_id, fatal=False))
elif source_type == 'dash' or ext == 'mpd': elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
formats.extend(self._extract_mpd_formats( formats.extend(self._extract_mpd_formats(
source_url, video_id, mpd_id=mpd_id, fatal=False)) source_url, video_id, mpd_id=mpd_id, fatal=False))
elif ext == 'smil': elif ext == 'smil':
@ -3335,13 +3349,12 @@ class InfoExtractor:
'ext': ext, 'ext': ext,
}) })
else: else:
format_id = str_or_none(source.get('label'))
height = int_or_none(source.get('height')) height = int_or_none(source.get('height'))
if height is None: if height is None and format_id:
# Often no height is provided but there is a label in # Often no height is provided but there is a label in
# format like "1080p", "720p SD", or 1080. # format like "1080p", "720p SD", or 1080.
height = int_or_none(self._search_regex( height = parse_resolution(format_id).get('height')
r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
'height', default=None))
a_format = { a_format = {
'url': source_url, 'url': source_url,
'width': int_or_none(source.get('width')), 'width': int_or_none(source.get('width')),
@ -3349,6 +3362,7 @@ class InfoExtractor:
'tbr': int_or_none(source.get('bitrate'), scale=1000), 'tbr': int_or_none(source.get('bitrate'), scale=1000),
'filesize': int_or_none(source.get('filesize')), 'filesize': int_or_none(source.get('filesize')),
'ext': ext, 'ext': ext,
'format_id': format_id
} }
if source_url.startswith('rtmp'): if source_url.startswith('rtmp'):
a_format['ext'] = 'flv' a_format['ext'] = 'flv'

View file

@ -32,6 +32,7 @@ from ..utils import (
unified_timestamp, unified_timestamp,
unsmuggle_url, unsmuggle_url,
url_or_none, url_or_none,
urljoin,
variadic, variadic,
xpath_attr, xpath_attr,
xpath_text, xpath_text,
@ -1867,11 +1868,13 @@ class GenericIE(InfoExtractor):
'display_id': 'kelis-4th-of-july', 'display_id': 'kelis-4th-of-july',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Kelis - 4th Of July', 'title': 'Kelis - 4th Of July',
'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', 'description': 'Kelis - 4th Of July',
'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
'expected_warnings': ['Untested major version'],
}, { }, {
# KVS Player # KVS Player
'url': 'https://www.kvs-demo.com/embed/105/', 'url': 'https://www.kvs-demo.com/embed/105/',
@ -1880,35 +1883,12 @@ class GenericIE(InfoExtractor):
'display_id': 'kelis-4th-of-july', 'display_id': 'kelis-4th-of-july',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Kelis - 4th Of July / Embed Player', 'title': 'Kelis - 4th Of July / Embed Player',
'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
}, { }, {
# KVS Player
'url': 'https://thisvid.com/videos/french-boy-pantsed/',
'md5': '3397979512c682f6b85b3b04989df224',
'info_dict': {
'id': '2400174',
'display_id': 'french-boy-pantsed',
'ext': 'mp4',
'title': 'French Boy Pantsed - ThisVid.com',
'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
}
}, {
# KVS Player
'url': 'https://thisvid.com/embed/2400174/',
'md5': '3397979512c682f6b85b3b04989df224',
'info_dict': {
'id': '2400174',
'display_id': 'french-boy-pantsed',
'ext': 'mp4',
'title': 'French Boy Pantsed - ThisVid.com',
'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
}
}, {
# KVS Player
'url': 'https://youix.com/video/leningrad-zoj/', 'url': 'https://youix.com/video/leningrad-zoj/',
'md5': '94f96ba95706dc3880812b27b7d8a2b8', 'md5': '94f96ba95706dc3880812b27b7d8a2b8',
'info_dict': { 'info_dict': {
@ -1916,8 +1896,8 @@ class GenericIE(InfoExtractor):
'display_id': 'leningrad-zoj', 'display_id': 'leningrad-zoj',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com', 'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com',
'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg', 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg',
} },
}, { }, {
# KVS Player # KVS Player
'url': 'https://youix.com/embed/18485', 'url': 'https://youix.com/embed/18485',
@ -1927,19 +1907,20 @@ class GenericIE(InfoExtractor):
'display_id': 'leningrad-zoj', 'display_id': 'leningrad-zoj',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Ленинград - ЗОЖ', 'title': 'Ленинград - ЗОЖ',
'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg', 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg',
} },
}, { }, {
# KVS Player # KVS Player
'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/', 'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/',
'md5': '94166bdb26b4cb1fb9214319a629fc51', 'md5': '94166bdb26b4cb1fb9214319a629fc51',
'info_dict': { 'info_dict': {
'id': '21217', 'id': '21217',
'display_id': '40-nochey-40-nights-2016', 'display_id': '40-nochey-2016',
'ext': 'mp4', 'ext': 'mp4',
'title': '40 ночей (2016) - BogMedia.org', 'title': '40 ночей (2016) - BogMedia.org',
'description': 'md5:4e6d7d622636eb7948275432eb256dc3',
'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg', 'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg',
} },
}, },
{ {
# KVS Player (for sites that serve kt_player.js via non-https urls) # KVS Player (for sites that serve kt_player.js via non-https urls)
@ -1949,9 +1930,9 @@ class GenericIE(InfoExtractor):
'id': '389508', 'id': '389508',
'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source', 'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер', 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
'thumbnail': 'http://www.camhub.world/contents/videos_screenshots/389000/389508/preview.mp4.jpg', 'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg',
} },
}, },
{ {
# Reddit-hosted video that will redirect and be processed by RedditIE # Reddit-hosted video that will redirect and be processed by RedditIE
@ -2169,7 +2150,20 @@ class GenericIE(InfoExtractor):
'direct': True, 'direct': True,
'age_limit': 0, 'age_limit': 0,
} }
} },
{
'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/',
'md5': 'e2f0a4c329f7986280b7328e24036d60',
'info_dict': {
'id': '284002',
'display_id': 'just-out-of-the-shower-joi',
'ext': 'mp4',
'title': 'Just Out Of The Shower JOI - Shooshtime',
'thumbnail': 'https://i.shoosh.co/contents/videos_screenshots/284000/284002/preview.mp4.jpg',
'height': 720,
'age_limit': 18,
},
},
] ]
def report_following_redirect(self, new_url): def report_following_redirect(self, new_url):
@ -2235,43 +2229,87 @@ class GenericIE(InfoExtractor):
'entries': entries, 'entries': entries,
} }
def _kvs_getrealurl(self, video_url, license_code): @classmethod
def _kvs_get_real_url(cls, video_url, license_code):
if not video_url.startswith('function/0/'): if not video_url.startswith('function/0/'):
return video_url # not obfuscated return video_url # not obfuscated
url_path, _, url_query = video_url.partition('?') parsed = urllib.parse.urlparse(video_url[len('function/0/'):])
urlparts = url_path.split('/')[2:] license = cls._kvs_get_license_token(license_code)
license = self._kvs_getlicensetoken(license_code) urlparts = parsed.path.split('/')
newmagic = urlparts[5][:32]
for o in range(len(newmagic) - 1, -1, -1): HASH_LENGTH = 32
new = '' hash = urlparts[3][:HASH_LENGTH]
l = (o + sum(int(n) for n in license[o:])) % 32 indices = list(range(HASH_LENGTH))
for i in range(0, len(newmagic)): # Swap indices of hash according to the destination calculated from the license token
if i == o: accum = 0
new += newmagic[l] for src in reversed(range(HASH_LENGTH)):
elif i == l: accum += license[src]
new += newmagic[o] dest = (src + accum) % HASH_LENGTH
else: indices[src], indices[dest] = indices[dest], indices[src]
new += newmagic[i]
newmagic = new
urlparts[5] = newmagic + urlparts[5][32:] urlparts[3] = ''.join(hash[index] for index in indices) + urlparts[3][HASH_LENGTH:]
return '/'.join(urlparts) + '?' + url_query return urllib.parse.urlunparse(parsed._replace(path='/'.join(urlparts)))
def _kvs_getlicensetoken(self, license): @staticmethod
modlicense = license.replace('$', '').replace('0', '1') def _kvs_get_license_token(license):
center = int(len(modlicense) / 2) license = license.replace('$', '')
license_values = [int(char) for char in license]
modlicense = license.replace('0', '1')
center = len(modlicense) // 2
fronthalf = int(modlicense[:center + 1]) fronthalf = int(modlicense[:center + 1])
backhalf = int(modlicense[center:]) backhalf = int(modlicense[center:])
modlicense = str(4 * abs(fronthalf - backhalf))[:center + 1]
modlicense = str(4 * abs(fronthalf - backhalf)) return [
retval = '' (license_values[index + offset] + current) % 10
for o in range(0, center + 1): for index, current in enumerate(map(int, modlicense))
for i in range(1, 5): for offset in range(4)
retval += str((int(license[o + i]) + int(modlicense[o])) % 10) ]
return retval
def _extract_kvs(self, url, webpage, video_id):
    """Build an info dict for a page that embeds the KVS (kt_player) player.

    The player configuration is read from the `flashvars` JavaScript object
    found in `webpage`; every `video_url` / `video_alt_url<N>` entry that
    points at a `/get_file/` location becomes one format.

    @param url       URL of the page; used to resolve relative format URLs,
                     to complete protocol-relative thumbnails and as Referer
    @param webpage   HTML of the page
    @param video_id  ID passed to the JSON search for reporting
    @returns         dict with id, display_id, title, thumbnail and formats
    """
    flashvars = self._search_json(
        r'(?s:<script\b[^>]*>.*?var\s+flashvars\s*=)',
        webpage, 'flashvars', video_id, transform_source=js_to_json)

    # The display_id is the part of the canonical URL after its last "/"
    display_id = self._search_regex(
        r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>'
        r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
        webpage, 'display_id', fatal=False)
    title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')

    thumbnail = flashvars['preview_url']
    if thumbnail.startswith('//'):
        # Protocol-relative thumbnail: prepend everything before the first "/" of the page URL
        thumbnail = url.partition('/')[0] + thumbnail

    is_video_url_key = re.compile(r'^video_(?:url|alt_url\d*)$').match
    formats = []
    for key in flashvars.keys():
        if not is_video_url_key(key):
            continue
        raw_url = flashvars[key]
        if '/get_file/' not in raw_url:
            continue
        # A human-readable label may be stored under "<key>_text"; fall back to the key itself
        format_id = flashvars.get(f'{key}_text', key)
        fmt = {
            'url': urljoin(url, self._kvs_get_real_url(raw_url, flashvars['license_code'])),
            'format_id': format_id,
            'ext': 'mp4',
            **(parse_resolution(format_id) or parse_resolution(raw_url)),
            'http_headers': {'Referer': url},
        }
        if not fmt.get('height'):
            # No resolution could be parsed from the label or the URL
            fmt['quality'] = 1
        formats.append(fmt)

    return {
        'id': flashvars['video_id'],
        'display_id': display_id,
        'title': title,
        'thumbnail': thumbnail,
        'formats': formats,
    }
def _real_extract(self, url): def _real_extract(self, url):
if url.startswith('//'): if url.startswith('//'):
@ -2580,6 +2618,17 @@ class GenericIE(InfoExtractor):
self.report_detected('video.js embed') self.report_detected('video.js embed')
return [{'formats': formats, 'subtitles': subtitles}] return [{'formats': formats, 'subtitles': subtitles}]
# Look for generic KVS player (before json-ld bc of some urls that break otherwise)
found = self._search_regex((
r'<script\b[^>]+?\bsrc\s*=\s*(["\'])https?://(?:\S+?/)+kt_player\.js\?v=(?P<ver>\d+(?:\.\d+)+)\1[^>]*>',
r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:\S+?/)+kt_player\.swf\?v=(?P<ver>\d+(?:\.\d+)+)\2\s*,',
), webpage, 'KVS player', group='ver', default=False)
if found:
self.report_detected('KWS Player')
if found.split('.')[0] not in ('4', '5', '6'):
self.report_warning(f'Untested major version ({found}) in player engine - download may fail.')
return [self._extract_kvs(url, webpage, video_id)]
# Looking for http://schema.org/VideoObject # Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(webpage, video_id, default={}) json_ld = self._search_json_ld(webpage, video_id, default={})
if json_ld.get('url') not in (url, None): if json_ld.get('url') not in (url, None):
@ -2622,52 +2671,6 @@ class GenericIE(InfoExtractor):
['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage)) ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
if found: if found:
self.report_detected('JW Player embed') self.report_detected('JW Player embed')
if not found:
# Look for generic KVS player
found = re.search(r'<script [^>]*?src="https?://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>', webpage)
if found:
self.report_detected('KWS Player')
if found.group('maj_ver') not in ['4', '5']:
self.report_warning('Untested major version (%s) in player engine--Download may fail.' % found.group('ver'))
flashvars = re.search(r'(?ms)<script.*?>.*?var\s+flashvars\s*=\s*(\{.*?\});.*?</script>', webpage)
flashvars = self._parse_json(flashvars.group(1), video_id, transform_source=js_to_json)
# extract the part after the last / as the display_id from the
# canonical URL.
display_id = self._search_regex(
r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>'
r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
webpage, 'display_id', fatal=False
)
title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')
thumbnail = flashvars['preview_url']
if thumbnail.startswith('//'):
protocol, _, _ = url.partition('/')
thumbnail = protocol + thumbnail
url_keys = list(filter(re.compile(r'video_url|video_alt_url\d*').fullmatch, flashvars.keys()))
formats = []
for key in url_keys:
if '/get_file/' not in flashvars[key]:
continue
format_id = flashvars.get(f'{key}_text', key)
formats.append({
'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']),
'format_id': format_id,
'ext': 'mp4',
**(parse_resolution(format_id) or parse_resolution(flashvars[key]))
})
if not formats[-1].get('height'):
formats[-1]['quality'] = 1
return [{
'id': flashvars['video_id'],
'display_id': display_id,
'title': title,
'thumbnail': thumbnail,
'formats': formats,
}]
if not found: if not found:
# Broaden the search a little bit # Broaden the search a little bit
found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))

View file

@ -1,71 +1,128 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import (
ExtractorError,
get_element_by_class,
int_or_none,
merge_dicts,
url_or_none,
)
class PeekVidsIE(InfoExtractor): class PeekVidsBaseIE(InfoExtractor):
def _real_extract(self, url):
    """Extract a video from a site matched by the subclass's `_VALID_URL`.

    `_VALID_URL` must provide the named groups `domain` and `id`. The ID from
    the URL becomes the display_id; the short ID found in the page's <video>
    tag becomes the actual video ID and is used to request the source list.

    @raises ExtractorError  when the page shows the rate-limit notice
    """
    domain, video_id = self._match_valid_url(url).group('domain', 'id')
    # 429 responses are accepted so that the rate-limit page can be inspected below
    webpage = self._download_webpage(url, video_id, expected_status=429)
    if '>Rate Limit Exceeded' in webpage:
        raise ExtractorError(
            f'You are suspected as a bot. Wait, or pass the captcha on the site and provide cookies. {self._login_hint()}',
            video_id=video_id, expected=True)

    title = self._html_search_regex(r'(?s)<h1\b[^>]*>(.+?)</h1>', webpage, 'title')

    display_id = video_id
    video_id = self._search_regex(r'(?s)<video\b[^>]+\bdata-id\s*=\s*["\']?([\w-]+)', webpage, 'short video ID')
    srcs = self._download_json(
        f'https://www.{domain}/v-alt/{video_id}', video_id,
        note='Downloading list of source files')

    formats = []
    for attr_name, candidate in srcs.items():
        source_url = url_or_none(candidate)
        # Only keys of the form "data-src<height>" with a valid URL are used
        height = source_url and self._search_regex(
            r'^data-src(\d{3,})$', attr_name, 'height', default=None)
        if height:
            formats.append({
                'url': source_url,
                'format_id': height,
                'height': int_or_none(height),
            })

    if not formats:
        # Nothing matched the expected key format: offer every value as-is
        formats = [{'url': src} for src in srcs.values()]

    info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={})
    info.pop('url', None)

    # may not have found the thumbnail if it was in a list in the ld+json
    info.setdefault('thumbnail', self._og_search_thumbnail(webpage))
    detail = (
        get_element_by_class('detail-video-block', webpage)
        or get_element_by_class('detail-block', webpage)
        or '')
    # Take the detail text that precedes the ld+json description (or the first <ul>)
    description = self._html_search_regex(
        rf'(?s)(.+?)(?:{re.escape(info.get("description", ""))}\s*<|<ul\b)',
        detail, 'description', default=None)
    info['description'] = description or None
    # Drop the trailing ", <something>" / "- <something>" suffix from the title
    trimmed_title = re.sub(r'\s*[,-][^,-]+$', '', info.get('title') or title)
    info['title'] = trimmed_title or self._generic_title(url)

    def cat_tags(name, html):
        # Whitespace-separated values that follow a "<name>:" label span
        raw = self._html_search_regex(
            rf'(?s)<span\b[^>]*>\s*{re.escape(name)}\s*:\s*</span>(.+?)</li>',
            html, name, default='')
        return [item for item in re.split(r'\s+', raw) if item]

    return merge_dicts({
        'id': video_id,
        'display_id': display_id,
        'age_limit': 18,
        'formats': formats,
        'categories': cat_tags('Categories', detail),
        'tags': cat_tags('Tags', detail),
        'uploader': self._html_search_regex(r'[Uu]ploaded\s+by\s(.+?)"', webpage, 'uploader', default=None),
    }, info)
class PeekVidsIE(PeekVidsBaseIE):
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
https?://(?:www\.)?peekvids\.com/ https?://(?:www\.)?(?P<domain>peekvids\.com)/
(?:(?:[^/?#]+/){2}|embed/?\?(?:[^#]*&)?v=) (?:(?:[^/?#]+/){2}|embed/?\?(?:[^#]*&)?v=)
(?P<id>[^/?&#]*) (?P<id>[^/?&#]*)
''' '''
_TESTS = [{ _TESTS = [{
'url': 'https://peekvids.com/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp/BSyLMbN0YCd', 'url': 'https://peekvids.com/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp/BSyLMbN0YCd',
'md5': 'a00940646c428e232407e3e62f0e8ef5', 'md5': '2ff6a357a9717dc9dc9894b51307e9a2',
'info_dict': { 'info_dict': {
'id': 'BSyLMbN0YCd', 'id': '1262717',
'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp, SEXYhub', 'display_id': 'BSyLMbN0YCd',
'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp',
'ext': 'mp4', 'ext': 'mp4',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'description': 'Watch Dane Jones - Cute redhead with perfect tits with Mini Vamp (7 min), uploaded by SEXYhub.com', 'description': 'md5:0a61df3620de26c0af8963b1a730cd69',
'timestamp': 1642579329, 'timestamp': 1642579329,
'upload_date': '20220119', 'upload_date': '20220119',
'duration': 416, 'duration': 416,
'view_count': int, 'view_count': int,
'age_limit': 18, 'age_limit': 18,
'uploader': 'SEXYhub.com',
'categories': list,
'tags': list,
}, },
}] }]
_DOMAIN = 'www.peekvids.com'
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
short_video_id = self._html_search_regex(r'<video [^>]*data-id="(.+?)"', webpage, 'short video ID')
srcs = self._download_json(
f'https://{self._DOMAIN}/v-alt/{short_video_id}', video_id,
note='Downloading list of source files')
formats = [{
'url': url,
'ext': 'mp4',
'format_id': name[8:],
} for name, url in srcs.items() if len(name) > 8 and name.startswith('data-src')]
if not formats:
formats = [{'url': url} for url in srcs.values()]
info = self._search_json_ld(webpage, video_id, expected_type='VideoObject')
info.update({
'id': video_id,
'age_limit': 18,
'formats': formats,
})
return info
class PlayVidsIE(PeekVidsIE): # XXX: Do not subclass from concrete IE class PlayVidsIE(PeekVidsBaseIE):
_VALID_URL = r'https?://(?:www\.)?playvids\.com/(?:embed/|[^/]{2}/)?(?P<id>[^/?#]*)' _VALID_URL = r'https?://(?:www\.)?(?P<domain>playvids\.com)/(?:embed/|\w\w?/)?(?P<id>[^/?#]*)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.playvids.com/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp', 'url': 'https://www.playvids.com/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp',
'md5': 'cd7dfd8a2e815a45402369c76e3c1825', 'md5': '2f12e50213dd65f142175da633c4564c',
'info_dict': { 'info_dict': {
'id': 'U3pBrYhsjXM', 'id': '1978030',
'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp, SEXYhub', 'display_id': 'U3pBrYhsjXM',
'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp',
'ext': 'mp4', 'ext': 'mp4',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'description': 'Watch Dane Jones - Cute redhead with perfect tits with Mini Vamp video in HD, uploaded by SEXYhub.com', 'description': 'md5:0a61df3620de26c0af8963b1a730cd69',
'timestamp': 1640435839, 'timestamp': 1640435839,
'upload_date': '20211225', 'upload_date': '20211225',
'duration': 416, 'duration': 416,
'view_count': int, 'view_count': int,
'age_limit': 18, 'age_limit': 18,
'uploader': 'SEXYhub.com',
'categories': list,
'tags': list,
}, },
}, { }, {
'url': 'https://www.playvids.com/es/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp', 'url': 'https://www.playvids.com/es/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp',
@ -73,5 +130,62 @@ class PlayVidsIE(PeekVidsIE): # XXX: Do not subclass from concrete IE
}, { }, {
'url': 'https://www.playvids.com/embed/U3pBrYhsjXM', 'url': 'https://www.playvids.com/embed/U3pBrYhsjXM',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://www.playvids.com/bKmGLe3IwjZ/sv/brazzers-800-phone-sex-madison-ivy-always-on-the-line',
'md5': 'e783986e596cafbf46411a174ab42ba6',
'info_dict': {
'id': '762385',
'display_id': 'bKmGLe3IwjZ',
'ext': 'mp4',
'title': 'Brazzers - 1 800 Phone Sex: Madison Ivy Always On The Line 6',
'description': 'md5:bdcd2db2b8ad85831a491d7c8605dcef',
'timestamp': 1516958544,
'upload_date': '20180126',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 480,
'uploader': 'Brazzers',
'age_limit': 18,
'view_count': int,
'age_limit': 18,
'categories': list,
'tags': list,
},
}, {
'url': 'https://www.playvids.com/v/47iUho33toY',
'md5': 'b056b5049d34b648c1e86497cf4febce',
'info_dict': {
'id': '700621',
'display_id': '47iUho33toY',
'ext': 'mp4',
'title': 'KATEE OWEN STRIPTIASE IN SEXY RED LINGERIE',
'description': None,
'timestamp': 1507052209,
'upload_date': '20171003',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 332,
'uploader': 'Cacerenele',
'age_limit': 18,
'view_count': int,
'categories': list,
'tags': list,
},
}, {
'url': 'https://www.playvids.com/z3_7iwWCmqt/sexy-teen-filipina-striptease-beautiful-pinay-bargirl-strips-and-dances',
'md5': 'efa09be9f031314b7b7e3bc6510cd0df',
'info_dict': {
'id': '1523518',
'display_id': 'z3_7iwWCmqt',
'ext': 'mp4',
'title': 'SEXY TEEN FILIPINA STRIPTEASE - Beautiful Pinay Bargirl Strips and Dances',
'description': None,
'timestamp': 1607470323,
'upload_date': '20201208',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 593,
'uploader': 'yorours',
'age_limit': 18,
'view_count': int,
'categories': list,
'tags': list,
},
}] }]
_DOMAIN = 'www.playvids.com'

226
yt_dlp/extractor/thisvid.py Normal file
View file

@ -0,0 +1,226 @@
import itertools
import re
import urllib.parse
from .common import InfoExtractor
from ..utils import (
clean_html,
get_element_by_class,
int_or_none,
url_or_none,
urljoin,
)
class ThisVidIE(InfoExtractor):
    """Extractor for thisvid.com video pages and their embed counterparts.

    Only page-level metadata (title, uploader, age limit) is gathered here;
    the result is a transparent URL result handed to the Generic extractor.
    """

    _VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?P<type>videos|embed)/(?P<id>[A-Za-z0-9-]+)'
    _TESTS = [{
        'url': 'https://thisvid.com/videos/sitting-on-ball-tight-jeans/',
        'md5': '839becb572995687e11a69dc4358a386',
        'info_dict': {
            'id': '3533241',
            'ext': 'mp4',
            'title': 'Sitting on ball tight jeans',
            'description': 'md5:372353bb995883d1b65fddf507489acd',
            'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg',
            'uploader_id': '150629',
            'uploader': 'jeanslevisjeans',
            'display_id': 'sitting-on-ball-tight-jeans',
            'age_limit': 18,
        },
    }, {
        'url': 'https://thisvid.com/embed/3533241/',
        'md5': '839becb572995687e11a69dc4358a386',
        'info_dict': {
            'id': '3533241',
            'ext': 'mp4',
            'title': 'Sitting on ball tight jeans',
            'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg',
            'uploader_id': '150629',
            'uploader': 'jeanslevisjeans',
            'display_id': 'sitting-on-ball-tight-jeans',
            'age_limit': 18,
        },
    }]

    def _real_extract(self, url):
        """Collect title and uploader from the page, then delegate `url` to Generic."""
        url_match = re.match(self._VALID_URL, url)
        main_id = url_match.group('id')
        type_ = url_match.group('type')
        webpage = self._download_webpage(url, main_id)

        title = self._html_search_regex(
            r'<title\b[^>]*?>(?:Video:\s+)?(.+?)(?:\s+-\s+ThisVid(?:\.com| tube))?</title>',
            webpage, 'title')

        if type_ == 'embed':
            # look for more metadata on the main video page referenced by the embed
            main_page_url = url_or_none(self._search_regex(
                rf'''video_alt_url\s*:\s+'({self._VALID_URL}/)',''',
                webpage, 'video_alt_url', default=None))
            if main_page_url and main_page_url != url:
                main_page = self._download_webpage(
                    main_page_url, main_id,
                    note='Redirecting embed to main page', fatal=False)
                # keep the embed page when the main page could not be fetched
                webpage = main_page or webpage

        video_holder = get_element_by_class('video-holder', webpage) or ''
        if '>This video is a private video' in video_holder:
            # use the first line of the holder's text as the login message
            login_message = clean_html(video_holder) or 'Private video'
            self.raise_login_required(login_message.partition('\n')[0])

        # Captures "<id>/...>name" from the author link; split into its two parts below
        author_capture = self._html_search_regex(
            r'''(?s)<span\b[^>]*>Added by:\s*</span><a\b[^>]+\bclass\s*=\s*["']author\b[^>]+\bhref\s*=\s*["']https://thisvid\.com/members/([0-9]+/.{3,}?)\s*</a>''',
            webpage, 'uploader', default='')
        author_parts = re.split(r'''/["'][^>]*>\s*''', author_capture)

        uploader_id = uploader = None
        if len(author_parts) == 2:
            # id must be non-empty, uploader could be ''
            uploader_id, uploader_name = author_parts
            uploader = uploader_name or None

        return self.url_result(
            url, ie='Generic', url_transparent=True,
            title=title,
            age_limit=18,
            uploader=uploader,
            uploader_id=uploader_id)
class ThisVidPlaylistBaseIE(InfoExtractor):
    """Shared link collection, paging and result building for ThisVid lists."""

    # Regex source of the entry links to collect; set by each subclass
    _PLAYLIST_URL_RE = None

    @classmethod
    def _find_urls(cls, html):
        """Yield the href of every anchor in `html` that matches `_PLAYLIST_URL_RE`."""
        for m in re.finditer(rf'''<a\b[^>]+\bhref\s*=\s*["'](?P<url>{cls._PLAYLIST_URL_RE}\b)[^>]+>''', html):
            yield m.group('url')

    def _generate_playlist_entries(self, url, playlist_id, html=None):
        """Yield entry URLs from the list at `url`, following its pages in turn.

        url: address of the first page; also the base against which a
            "next" link is resolved.
        playlist_id: ID passed to the page downloads.
        html: content of the first page if already downloaded; when falsy
            the page is downloaded here.

        Iteration stops when no next page can be determined, which includes
        a page that failed to download (downloads are non-fatal).
        """
        page_url = url
        for page in itertools.count(1):
            if not html:
                html = self._download_webpage(
                    page_url, playlist_id, note=f'Downloading page {page}',
                    fatal=False) or ''

            yield from self._find_urls(html)

            # '' (not None) when the page has no pagination element, so the
            # `is None` branch below is skipped and iteration ends
            next_page = get_element_by_class('pagination-next', html) or ''
            if next_page:
                # member list page
                # (a link starting with `#` is rejected by the pattern)
                next_page = urljoin(url, self._search_regex(
                    r'''<a\b[^>]+\bhref\s*=\s*("|')(?P<url>(?!#)(?:(?!\1).)+)''',
                    next_page, 'next page link', group='url', default=None))

            # in case a member page should have pagination-next with empty link, not just `else:`
            if next_page is None:
                # playlist page
                # Build the next address by incrementing a trailing page
                # number in the path; a path without one counts as page 1
                parsed_url = urllib.parse.urlparse(page_url)
                base_path, _, num = parsed_url.path.rpartition('/')
                num = int_or_none(num)
                if num is None:
                    base_path, num = parsed_url.path.rstrip('/'), 1
                parsed_url = parsed_url._replace(path=f'{base_path}/{num + 1}')
                next_page = urllib.parse.urlunparse(parsed_url)
                if page_url == next_page:
                    next_page = None

            if not next_page:
                return
            # clear `html` so the next iteration downloads the new page
            page_url, html = next_page, None

    def _make_playlist_result(self, url):
        """Download the list page at `url` and return it as a playlist result.

        The title comes from the og:title metadata, or else the `<title>`
        element, with a trailing `| ThisVid.com` removed; it is None when
        neither yields any text.
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        # `maxsplit` is passed by keyword: passing it positionally to
        # re.split() is deprecated since Python 3.13
        title = re.split(
            r'(?i)\s*\|\s*ThisVid\.com\s*$',
            self._og_search_title(webpage, default=None)
            or self._html_search_regex(r'(?s)<title\b[^>]*>(.+?)</title', webpage, 'title', fatal=False) or '',
            maxsplit=1)[0] or None

        return self.playlist_from_matches(
            self._generate_playlist_entries(url, playlist_id, webpage),
            playlist_id=playlist_id, playlist_title=title, ie=ThisVidIE)
class ThisVidMemberIE(ThisVidPlaylistBaseIE):
    """Playlist extractor for the video lists under thisvid.com/members/<id>."""

    _VALID_URL = r'https?://thisvid\.com/members/(?P<id>\d+)'
    # Entries are the links that match the single-video extractor's URL pattern
    _PLAYLIST_URL_RE = ThisVidIE._VALID_URL

    _TESTS = [
        {
            'url': 'https://thisvid.com/members/2140501/',
            'info_dict': {
                'id': '2140501',
                'title': "Rafflesia's Profile",
            },
            'playlist_mincount': 16,
        },
        {
            'url': 'https://thisvid.com/members/2140501/favourite_videos/',
            'info_dict': {
                'id': '2140501',
                'title': "Rafflesia's Favourite Videos",
            },
            'playlist_mincount': 15,
        },
        {
            'url': 'https://thisvid.com/members/636468/public_videos/',
            'info_dict': {
                'id': '636468',
                'title': "Happymouth's Public Videos",
            },
            'playlist_mincount': 196,
        },
    ]

    def _real_extract(self, url):
        """Return the playlist result built by the shared base-class helper."""
        playlist = self._make_playlist_result(url)
        return playlist
class ThisVidPlaylistIE(ThisVidPlaylistBaseIE):
    """Extractor for thisvid.com/playlist/<id>/video/<slug> URLs.

    Yields the whole playlist, or redirects to the single video named in
    the URL when `_yes_playlist` says the playlist is not wanted.
    """

    _VALID_URL = r'https?://thisvid\.com/playlist/(?P<id>\d+)/video/(?P<video_id>[A-Za-z0-9-]+)'
    _TESTS = [{
        'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/',
        'info_dict': {
            'id': '6615',
            'title': 'Underwear Stuff',
        },
        'playlist_mincount': 200,
    }, {
        'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/',
        'info_dict': {
            'id': '1072387',
            'ext': 'mp4',
            'title': 'Big Italian Booty 28',
            'description': 'md5:1bccf7b13765e18fb27bf764dba7ede2',
            'uploader_id': '367912',
            'uploader': 'Jcmusclefun',
            'age_limit': 18,
            'display_id': 'big-italian-booty-28',
            'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+1072387/preview\.jpg',
        },
        'params': {
            'noplaylist': True,
        },
    }]
    # Playlist pages link to their entries with playlist-style URLs
    _PLAYLIST_URL_RE = _VALID_URL

    def _generate_playlist_entries(self, url, playlist_id, html=None):
        """Yield plain `/videos/<slug>/` URLs in place of the playlist-wrapped links."""
        for wrapped_url in super()._generate_playlist_entries(url, playlist_id, html):
            # Every wrapped URL was collected with `_PLAYLIST_URL_RE`, which
            # is `_VALID_URL`, so the shared matcher applies to it
            video_id = self._match_valid_url(wrapped_url).group('video_id')
            yield urljoin(url, f'/videos/{video_id}/')

    @staticmethod
    def _dedupe_title(title):
        """Return `the title` for `the title - the title`, else `title` unchanged.

        Only an odd-length string longer than 5 characters whose middle
        character is `-` and whose stripped halves are equal and non-empty
        is collapsed.
        """
        if not title:
            return title
        length = len(title)
        if length > 5 and length % 2 != 0:
            middle = length // 2
            if title[middle] == '-':
                first, second = title[:middle].strip(), title[middle + 1:].strip()
                if first and first == second:
                    return first
        return title

    def _real_extract(self, url):
        """Return the playlist for `url`, or a redirect to its single video."""
        playlist_id, video_id = self._match_valid_url(url).group('id', 'video_id')

        if not self._yes_playlist(playlist_id, video_id):
            redirect_url = urljoin(url, f'/videos/{video_id}/')
            return self.url_result(redirect_url, ThisVidIE)

        result = self._make_playlist_result(url)

        # Fix duplicated title (`the title - the title` => `the title`)
        # The base class passes None as the title when the page yields none,
        # so the key may be absent or None; leave the result alone then
        title = result.get('title')
        if title:
            result['title'] = self._dedupe_title(title)
        return result