[youtube] Improvements to JS player extraction (See desc) (#860)
* fallback player url extraction when it fails to be extracted from the webpage
* don't download js player unnecessarily for clients that don't require it
* try to extract js player url from any additional client configs
* ability to skip the js player usage/download using `player_skip=js`
* ability to skip the initial webpage download using `player_skip=webpage`

Known issues:
* authentication for multi-channel accounts and multi-account cookies may not work correctly if the webpage or client configs are skipped
* formats from the web client requiring signature decryption will be skipped if player js extraction is skipped

Authored by: coletdjnz
This commit is contained in:
parent
bccdbd22d5
commit
b6de707d13
2 changed files with 57 additions and 28 deletions
|
@ -1436,7 +1436,7 @@ The following extractors use this feature:
|
||||||
* **youtube**
|
* **youtube**
|
||||||
* `skip`: `hls` or `dash` (or both) to skip download of the respective manifests
|
* `skip`: `hls` or `dash` (or both) to skip download of the respective manifests
|
||||||
* `player_client`: Clients to extract video data from. The main clients are `web`, `android`, `ios`, `mweb`. These also have `_music`, `_embedded`, `_agegate`, and `_creator` variants (Eg: `web_embedded`) (`mweb` has only `_agegate`). By default, `android,web` is used, but the agegate and creator variants are added as required for age-gated videos. Similarly the music variants are added for `music.youtube.com` urls. You can also use `all` to use all the clients
|
* `player_client`: Clients to extract video data from. The main clients are `web`, `android`, `ios`, `mweb`. These also have `_music`, `_embedded`, `_agegate`, and `_creator` variants (Eg: `web_embedded`) (`mweb` has only `_agegate`). By default, `android,web` is used, but the agegate and creator variants are added as required for age-gated videos. Similarly the music variants are added for `music.youtube.com` urls. You can also use `all` to use all the clients
|
||||||
* `player_skip`: `configs` - skip any requests for client configs and use defaults
|
* `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details
|
||||||
* `include_live_dash`: Include live dash formats (These formats don't download properly)
|
* `include_live_dash`: Include live dash formats (These formats don't download properly)
|
||||||
* `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side).
|
* `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side).
|
||||||
* `max_comments`: Maximum amount of comments to download (default all).
|
* `max_comments`: Maximum amount of comments to download (default all).
|
||||||
|
|
|
@ -117,6 +117,7 @@ INNERTUBE_CLIENTS = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
|
'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
|
||||||
|
'REQUIRE_JS_PLAYER': False
|
||||||
},
|
},
|
||||||
'android_embedded': {
|
'android_embedded': {
|
||||||
'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
|
'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
|
||||||
|
@ -126,7 +127,8 @@ INNERTUBE_CLIENTS = {
|
||||||
'clientVersion': '16.20',
|
'clientVersion': '16.20',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
'INNERTUBE_CONTEXT_CLIENT_NAME': 55
|
'INNERTUBE_CONTEXT_CLIENT_NAME': 55,
|
||||||
|
'REQUIRE_JS_PLAYER': False
|
||||||
},
|
},
|
||||||
'android_music': {
|
'android_music': {
|
||||||
'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
|
'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
|
||||||
|
@ -138,6 +140,7 @@ INNERTUBE_CLIENTS = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
'INNERTUBE_CONTEXT_CLIENT_NAME': 21,
|
'INNERTUBE_CONTEXT_CLIENT_NAME': 21,
|
||||||
|
'REQUIRE_JS_PLAYER': False
|
||||||
},
|
},
|
||||||
'android_creator': {
|
'android_creator': {
|
||||||
'INNERTUBE_CONTEXT': {
|
'INNERTUBE_CONTEXT': {
|
||||||
|
@ -146,7 +149,8 @@ INNERTUBE_CLIENTS = {
|
||||||
'clientVersion': '21.24.100',
|
'clientVersion': '21.24.100',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
'INNERTUBE_CONTEXT_CLIENT_NAME': 14
|
'INNERTUBE_CONTEXT_CLIENT_NAME': 14,
|
||||||
|
'REQUIRE_JS_PLAYER': False
|
||||||
},
|
},
|
||||||
# ios has HLS live streams
|
# ios has HLS live streams
|
||||||
# See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680
|
# See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680
|
||||||
|
@ -158,7 +162,8 @@ INNERTUBE_CLIENTS = {
|
||||||
'clientVersion': '16.20',
|
'clientVersion': '16.20',
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
'INNERTUBE_CONTEXT_CLIENT_NAME': 5
|
'INNERTUBE_CONTEXT_CLIENT_NAME': 5,
|
||||||
|
'REQUIRE_JS_PLAYER': False
|
||||||
},
|
},
|
||||||
'ios_embedded': {
|
'ios_embedded': {
|
||||||
'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
|
'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
|
||||||
|
@ -168,7 +173,8 @@ INNERTUBE_CLIENTS = {
|
||||||
'clientVersion': '16.20',
|
'clientVersion': '16.20',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
'INNERTUBE_CONTEXT_CLIENT_NAME': 66
|
'INNERTUBE_CONTEXT_CLIENT_NAME': 66,
|
||||||
|
'REQUIRE_JS_PLAYER': False
|
||||||
},
|
},
|
||||||
'ios_music': {
|
'ios_music': {
|
||||||
'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og',
|
'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og',
|
||||||
|
@ -179,7 +185,8 @@ INNERTUBE_CLIENTS = {
|
||||||
'clientVersion': '4.32',
|
'clientVersion': '4.32',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
'INNERTUBE_CONTEXT_CLIENT_NAME': 26
|
'INNERTUBE_CONTEXT_CLIENT_NAME': 26,
|
||||||
|
'REQUIRE_JS_PLAYER': False
|
||||||
},
|
},
|
||||||
'ios_creator': {
|
'ios_creator': {
|
||||||
'INNERTUBE_CONTEXT': {
|
'INNERTUBE_CONTEXT': {
|
||||||
|
@ -188,7 +195,8 @@ INNERTUBE_CLIENTS = {
|
||||||
'clientVersion': '21.24.100',
|
'clientVersion': '21.24.100',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
'INNERTUBE_CONTEXT_CLIENT_NAME': 15
|
'INNERTUBE_CONTEXT_CLIENT_NAME': 15,
|
||||||
|
'REQUIRE_JS_PLAYER': False
|
||||||
},
|
},
|
||||||
# mweb has 'ultralow' formats
|
# mweb has 'ultralow' formats
|
||||||
# See: https://github.com/yt-dlp/yt-dlp/pull/557
|
# See: https://github.com/yt-dlp/yt-dlp/pull/557
|
||||||
|
@ -215,6 +223,7 @@ def build_innertube_clients():
|
||||||
for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()):
|
for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()):
|
||||||
ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8')
|
ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8')
|
||||||
ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com')
|
ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com')
|
||||||
|
ytcfg.setdefault('REQUIRE_JS_PLAYER', True)
|
||||||
ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')
|
ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')
|
||||||
ytcfg['priority'] = 10 * priority(client.split('_', 1)[0])
|
ytcfg['priority'] = 10 * priority(client.split('_', 1)[0])
|
||||||
|
|
||||||
|
@ -1858,14 +1867,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
self._code_cache = {}
|
self._code_cache = {}
|
||||||
self._player_cache = {}
|
self._player_cache = {}
|
||||||
|
|
||||||
def _extract_player_url(self, ytcfg=None, webpage=None):
|
def _extract_player_url(self, *ytcfgs, webpage=None):
|
||||||
player_url = try_get(ytcfg, (lambda x: x['PLAYER_JS_URL']), str)
|
player_url = traverse_obj(
|
||||||
if not player_url and webpage:
|
ytcfgs, (..., 'PLAYER_JS_URL'), (..., 'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'jsUrl'),
|
||||||
player_url = self._search_regex(
|
get_all=False, expected_type=compat_str)
|
||||||
r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
|
|
||||||
webpage, 'player URL', fatal=False)
|
|
||||||
if not player_url:
|
if not player_url:
|
||||||
return None
|
return
|
||||||
if player_url.startswith('//'):
|
if player_url.startswith('//'):
|
||||||
player_url = 'https:' + player_url
|
player_url = 'https:' + player_url
|
||||||
elif not re.match(r'https?://', player_url):
|
elif not re.match(r'https?://', player_url):
|
||||||
|
@ -1873,6 +1880,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
'https://www.youtube.com', player_url)
|
'https://www.youtube.com', player_url)
|
||||||
return player_url
|
return player_url
|
||||||
|
|
||||||
|
def _download_player_url(self, video_id, fatal=False):
|
||||||
|
res = self._download_webpage(
|
||||||
|
'https://www.youtube.com/iframe_api',
|
||||||
|
note='Downloading iframe API JS', video_id=video_id, fatal=fatal)
|
||||||
|
if res:
|
||||||
|
player_version = self._search_regex(
|
||||||
|
r'player\\?/([0-9a-fA-F]{8})\\?/', res, 'player version', fatal=fatal)
|
||||||
|
if player_version:
|
||||||
|
return f'https://www.youtube.com/s/player/{player_version}/player_ias.vflset/en_US/base.js'
|
||||||
|
|
||||||
def _signature_cache_id(self, example_sig):
|
def _signature_cache_id(self, example_sig):
|
||||||
""" Return a string representation of a signature """
|
""" Return a string representation of a signature """
|
||||||
return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
|
return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
|
||||||
|
@ -2462,7 +2479,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
|
|
||||||
session_index = self._extract_session_index(player_ytcfg, master_ytcfg)
|
session_index = self._extract_session_index(player_ytcfg, master_ytcfg)
|
||||||
syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr)
|
syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr)
|
||||||
sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False)
|
sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None
|
||||||
headers = self.generate_api_headers(
|
headers = self.generate_api_headers(
|
||||||
player_ytcfg, identity_token, syncid,
|
player_ytcfg, identity_token, syncid,
|
||||||
default_client=client, session_index=session_index)
|
default_client=client, session_index=session_index)
|
||||||
|
@ -2507,7 +2524,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
webpage = self._download_webpage(url, video_id, fatal=False, note=f'Downloading {client} config')
|
webpage = self._download_webpage(url, video_id, fatal=False, note=f'Downloading {client} config')
|
||||||
return self.extract_ytcfg(video_id, webpage) or {}
|
return self.extract_ytcfg(video_id, webpage) or {}
|
||||||
|
|
||||||
def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, player_url, identity_token):
|
def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, identity_token):
|
||||||
initial_pr = None
|
initial_pr = None
|
||||||
if webpage:
|
if webpage:
|
||||||
initial_pr = self._extract_yt_initial_variable(
|
initial_pr = self._extract_yt_initial_variable(
|
||||||
|
@ -2516,6 +2533,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
|
|
||||||
original_clients = clients
|
original_clients = clients
|
||||||
clients = clients[::-1]
|
clients = clients[::-1]
|
||||||
|
prs = []
|
||||||
|
|
||||||
def append_client(client_name):
|
def append_client(client_name):
|
||||||
if client_name in INNERTUBE_CLIENTS and client_name not in original_clients:
|
if client_name in INNERTUBE_CLIENTS and client_name not in original_clients:
|
||||||
|
@ -2525,23 +2543,33 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
# extraction of some data. So we return the initial_pr with formats
|
# extraction of some data. So we return the initial_pr with formats
|
||||||
# stripped out even if not requested by the user
|
# stripped out even if not requested by the user
|
||||||
# See: https://github.com/yt-dlp/yt-dlp/issues/501
|
# See: https://github.com/yt-dlp/yt-dlp/issues/501
|
||||||
yielded_pr = False
|
|
||||||
if initial_pr:
|
if initial_pr:
|
||||||
pr = dict(initial_pr)
|
pr = dict(initial_pr)
|
||||||
pr['streamingData'] = None
|
pr['streamingData'] = None
|
||||||
yielded_pr = True
|
prs.append(pr)
|
||||||
yield pr
|
|
||||||
|
|
||||||
last_error = None
|
last_error = None
|
||||||
|
tried_iframe_fallback = False
|
||||||
|
player_url = None
|
||||||
while clients:
|
while clients:
|
||||||
client = clients.pop()
|
client = clients.pop()
|
||||||
player_ytcfg = master_ytcfg if client == 'web' else {}
|
player_ytcfg = master_ytcfg if client == 'web' else {}
|
||||||
if 'configs' not in self._configuration_arg('player_skip'):
|
if 'configs' not in self._configuration_arg('player_skip'):
|
||||||
player_ytcfg = self._extract_player_ytcfg(client, video_id) or player_ytcfg
|
player_ytcfg = self._extract_player_ytcfg(client, video_id) or player_ytcfg
|
||||||
|
|
||||||
|
player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage)
|
||||||
|
require_js_player = self._get_default_ytcfg(client).get('REQUIRE_JS_PLAYER')
|
||||||
|
if 'js' in self._configuration_arg('player_skip'):
|
||||||
|
require_js_player = False
|
||||||
|
player_url = None
|
||||||
|
|
||||||
|
if not player_url and not tried_iframe_fallback and require_js_player:
|
||||||
|
player_url = self._download_player_url(video_id)
|
||||||
|
tried_iframe_fallback = True
|
||||||
|
|
||||||
try:
|
try:
|
||||||
pr = initial_pr if client == 'web' and initial_pr else self._extract_player_response(
|
pr = initial_pr if client == 'web' and initial_pr else self._extract_player_response(
|
||||||
client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, identity_token, player_url, initial_pr)
|
client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, identity_token, player_url if require_js_player else None, initial_pr)
|
||||||
except ExtractorError as e:
|
except ExtractorError as e:
|
||||||
if last_error:
|
if last_error:
|
||||||
self.report_warning(last_error)
|
self.report_warning(last_error)
|
||||||
|
@ -2549,8 +2577,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if pr:
|
if pr:
|
||||||
yielded_pr = True
|
prs.append(pr)
|
||||||
yield pr
|
|
||||||
|
|
||||||
# creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in
|
# creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in
|
||||||
if client.endswith('_agegate') and self._is_unplayable(pr) and self._generate_sapisidhash_header():
|
if client.endswith('_agegate') and self._is_unplayable(pr) and self._generate_sapisidhash_header():
|
||||||
|
@ -2559,9 +2586,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
append_client(f'{client}_agegate')
|
append_client(f'{client}_agegate')
|
||||||
|
|
||||||
if last_error:
|
if last_error:
|
||||||
if not yielded_pr:
|
if not len(prs):
|
||||||
raise last_error
|
raise last_error
|
||||||
self.report_warning(last_error)
|
self.report_warning(last_error)
|
||||||
|
return prs, player_url
|
||||||
|
|
||||||
def _extract_formats(self, streaming_data, video_id, player_url, is_live):
|
def _extract_formats(self, streaming_data, video_id, player_url, is_live):
|
||||||
itags, stream_ids = [], []
|
itags, stream_ids = [], []
|
||||||
|
@ -2708,16 +2736,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
|
|
||||||
base_url = self.http_scheme() + '//www.youtube.com/'
|
base_url = self.http_scheme() + '//www.youtube.com/'
|
||||||
webpage_url = base_url + 'watch?v=' + video_id
|
webpage_url = base_url + 'watch?v=' + video_id
|
||||||
webpage = self._download_webpage(
|
webpage = None
|
||||||
webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
|
if 'webpage' not in self._configuration_arg('player_skip'):
|
||||||
|
webpage = self._download_webpage(
|
||||||
|
webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
|
||||||
|
|
||||||
master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg()
|
master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg()
|
||||||
player_url = self._extract_player_url(master_ytcfg, webpage)
|
|
||||||
identity_token = self._extract_identity_token(webpage, video_id)
|
identity_token = self._extract_identity_token(webpage, video_id)
|
||||||
|
|
||||||
player_responses = list(self._extract_player_responses(
|
player_responses, player_url = self._extract_player_responses(
|
||||||
self._get_requested_clients(url, smuggled_data),
|
self._get_requested_clients(url, smuggled_data),
|
||||||
video_id, webpage, master_ytcfg, player_url, identity_token))
|
video_id, webpage, master_ytcfg, identity_token)
|
||||||
|
|
||||||
get_first = lambda obj, keys, **kwargs: traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
|
get_first = lambda obj, keys, **kwargs: traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue