[extractor/youtube] Add extractor-arg include_duplicate_formats
This commit is contained in:
parent
c795c39f27
commit
86cb922118
2 changed files with 23 additions and 16 deletions
|
@ -1787,6 +1787,7 @@ The following extractors use this feature:
|
||||||
* `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
|
* `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
|
||||||
* `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all`
|
* `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all`
|
||||||
* E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total
|
* E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total
|
||||||
|
* `include_duplicate_formats`: Extract formats with identical content but different URLs or protocols. This is useful if some of the formats are unavailable or throttled.
|
||||||
* `include_incomplete_formats`: Extract formats that cannot be downloaded completely (live dash and post-live m3u8)
|
* `include_incomplete_formats`: Extract formats that cannot be downloaded completely (live dash and post-live m3u8)
|
||||||
* `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others
|
* `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others
|
||||||
* `innertube_key`: Innertube API key to use for all API requests
|
* `innertube_key`: Innertube API key to use for all API requests
|
||||||
|
|
|
@ -3640,6 +3640,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
|
'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
|
||||||
])
|
])
|
||||||
streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...))
|
streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...))
|
||||||
|
all_formats = self._configuration_arg('include_duplicate_formats')
|
||||||
|
|
||||||
for fmt in streaming_formats:
|
for fmt in streaming_formats:
|
||||||
if fmt.get('targetDurationSec'):
|
if fmt.get('targetDurationSec'):
|
||||||
|
@ -3648,8 +3649,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
itag = str_or_none(fmt.get('itag'))
|
itag = str_or_none(fmt.get('itag'))
|
||||||
audio_track = fmt.get('audioTrack') or {}
|
audio_track = fmt.get('audioTrack') or {}
|
||||||
stream_id = (itag, audio_track.get('id'), fmt.get('isDrc'))
|
stream_id = (itag, audio_track.get('id'), fmt.get('isDrc'))
|
||||||
if stream_id in stream_ids:
|
if not all_formats:
|
||||||
continue
|
if stream_id in stream_ids:
|
||||||
|
continue
|
||||||
|
|
||||||
quality = fmt.get('quality')
|
quality = fmt.get('quality')
|
||||||
height = int_or_none(fmt.get('height'))
|
height = int_or_none(fmt.get('height'))
|
||||||
|
@ -3739,7 +3741,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()),
|
try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()),
|
||||||
try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()),
|
try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()),
|
||||||
throttled and 'THROTTLED', is_damaged and 'DAMAGED',
|
throttled and 'THROTTLED', is_damaged and 'DAMAGED',
|
||||||
self.get_param('verbose') and client_name,
|
(self.get_param('verbose') or all_formats) and client_name,
|
||||||
delim=', '),
|
delim=', '),
|
||||||
# Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372
|
# Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372
|
||||||
'source_preference': -10 if throttled else -5 if itag == '22' else -1,
|
'source_preference': -10 if throttled else -5 if itag == '22' else -1,
|
||||||
|
@ -3762,26 +3764,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
if mime_mobj:
|
if mime_mobj:
|
||||||
dct['ext'] = mimetype2ext(mime_mobj.group(1))
|
dct['ext'] = mimetype2ext(mime_mobj.group(1))
|
||||||
dct.update(parse_codecs(mime_mobj.group(2)))
|
dct.update(parse_codecs(mime_mobj.group(2)))
|
||||||
|
if itag:
|
||||||
|
itags[itag].add(('https', dct.get('language')))
|
||||||
|
stream_ids.append(stream_id)
|
||||||
single_stream = 'none' in (dct.get('acodec'), dct.get('vcodec'))
|
single_stream = 'none' in (dct.get('acodec'), dct.get('vcodec'))
|
||||||
if single_stream and dct.get('ext'):
|
if single_stream and dct.get('ext'):
|
||||||
dct['container'] = dct['ext'] + '_dash'
|
dct['container'] = dct['ext'] + '_dash'
|
||||||
if single_stream or itag == '17':
|
|
||||||
CHUNK_SIZE = 10 << 20
|
CHUNK_SIZE = 10 << 20
|
||||||
dct.update({
|
if dct['filesize']:
|
||||||
|
yield {
|
||||||
|
**dct,
|
||||||
|
'format_id': f'{dct["format_id"]}-dashy' if all_formats else dct['format_id'],
|
||||||
'protocol': 'http_dash_segments',
|
'protocol': 'http_dash_segments',
|
||||||
'fragments': [{
|
'fragments': [{
|
||||||
'url': update_url_query(dct['url'], {
|
'url': update_url_query(dct['url'], {
|
||||||
'range': f'{range_start}-{min(range_start + CHUNK_SIZE - 1, dct["filesize"])}'
|
'range': f'{range_start}-{min(range_start + CHUNK_SIZE - 1, dct["filesize"])}'
|
||||||
})
|
})
|
||||||
} for range_start in range(0, dct['filesize'], CHUNK_SIZE)]
|
} for range_start in range(0, dct['filesize'], CHUNK_SIZE)]
|
||||||
} if itag != '17' and dct['filesize'] else {
|
}
|
||||||
'downloader_options': {'http_chunk_size': CHUNK_SIZE}
|
if not all_formats:
|
||||||
})
|
continue
|
||||||
|
dct['downloader_options'] = {'http_chunk_size': CHUNK_SIZE}
|
||||||
if itag:
|
|
||||||
itags[itag].add(('https', dct.get('language')))
|
|
||||||
stream_ids.append(stream_id)
|
|
||||||
yield dct
|
yield dct
|
||||||
|
|
||||||
needs_live_processing = self._needs_live_processing(live_status, duration)
|
needs_live_processing = self._needs_live_processing(live_status, duration)
|
||||||
|
@ -3803,11 +3807,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
|
|
||||||
def process_manifest_format(f, proto, client_name, itag):
|
def process_manifest_format(f, proto, client_name, itag):
|
||||||
key = (proto, f.get('language'))
|
key = (proto, f.get('language'))
|
||||||
if key in itags[itag]:
|
if not all_formats and key in itags[itag]:
|
||||||
return False
|
return False
|
||||||
itags[itag].add(key)
|
itags[itag].add(key)
|
||||||
|
|
||||||
if any(p != proto for p, _ in itags[itag]):
|
if itag and all_formats:
|
||||||
|
f['format_id'] = f'{itag}-{proto}'
|
||||||
|
elif any(p != proto for p, _ in itags[itag]):
|
||||||
f['format_id'] = f'{itag}-{proto}'
|
f['format_id'] = f'{itag}-{proto}'
|
||||||
elif itag:
|
elif itag:
|
||||||
f['format_id'] = itag
|
f['format_id'] = itag
|
||||||
|
|
Loading…
Reference in a new issue