From 8b7539d27c0a47d8d08e0522bdb66c571483377b Mon Sep 17 00:00:00 2001
From: pukkandan
Date: Sat, 29 Jan 2022 03:25:35 +0530
Subject: [PATCH] Implement `--add-header` without modifying `std_headers`

Closes #2526, #1614
---
 README.md                     |  5 ++---
 yt_dlp/YoutubeDL.py           |  8 ++++++--
 yt_dlp/__init__.py            | 15 ++++++---------
 yt_dlp/extractor/instagram.py |  3 +--
 yt_dlp/extractor/mildom.py    |  3 +--
 yt_dlp/extractor/openload.py  |  3 +--
 yt_dlp/extractor/rtve.py      |  3 +--
 yt_dlp/extractor/vimeo.py     |  3 +--
 yt_dlp/options.py             |  7 +++----
 yt_dlp/utils.py               |  7 ++++++-
 10 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index 1aab0ba94..ce5af129e 100644
--- a/README.md
+++ b/README.md
@@ -737,9 +737,6 @@ You can also fork the project on github and run your fork's [build workflow](.gi
     --prefer-insecure                Use an unencrypted connection to retrieve
                                      information about the video (Currently
                                      supported only for YouTube)
-    --user-agent UA                  Specify a custom user agent
-    --referer URL                    Specify a custom referer, use if the video
-                                     access is restricted to one domain
     --add-header FIELD:VALUE         Specify a custom HTTP header and its
                                      value, separated by a colon ":". You can
                                      use this option multiple times
@@ -1866,6 +1863,8 @@ While these options are redundant, they are still expected to be used due to the
     --reject-title REGEX             --match-filter "title !~= (?i)REGEX"
     --min-views COUNT                --match-filter "view_count >=? COUNT"
     --max-views COUNT                --match-filter "view_count <=? COUNT"
+    --user-agent UA                  --add-header "User-Agent:UA"
+    --referer URL                    --add-header "Referer:URL"
 
 
 #### Not recommended
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index 0aee3b122..49143cb16 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -83,6 +83,7 @@ from .utils import (
     make_dir,
     make_HTTPS_handler,
     MaxDownloadsReached,
+    merge_headers,
     network_exceptions,
     number_of_digits,
     orderedSet,
@@ -332,6 +333,7 @@ class YoutubeDL(object):
     nocheckcertificate:  Do not verify SSL certificates
     prefer_insecure:     Use HTTP instead of HTTPS to retrieve information.
                          At the moment, this is only supported by YouTube.
+    http_headers:        A dictionary of custom headers to be used for all requests
     proxy:               URL of the proxy server to use
     geo_verification_proxy:  URL of the proxy to use for IP address verification
                          on geo-restricted sites.
@@ -647,6 +649,9 @@ class YoutubeDL(object):
             else self.params['format'] if callable(self.params['format'])
             else self.build_format_selector(self.params['format']))
 
+        # Set http_headers defaults according to std_headers
+        self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {}))
+
         self._setup_opener()
 
         if auto_init:
@@ -2250,8 +2255,7 @@ class YoutubeDL(object):
         return _build_selector_function(parsed_selector)
 
     def _calc_headers(self, info_dict):
-        res = std_headers.copy()
-        res.update(info_dict.get('http_headers') or {})
+        res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {})
 
         cookies = self._calc_cookies(info_dict)
         if cookies:
diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py
index c87c5b6df..926b5cad3 100644
--- a/yt_dlp/__init__.py
+++ b/yt_dlp/__init__.py
@@ -41,6 +41,7 @@ from .utils import (
     SameFileError,
     setproctitle,
     std_headers,
+    traverse_obj,
     write_string,
 )
 from .update import run_update
@@ -75,20 +76,15 @@ def _real_main(argv=None):
     parser, opts, args = parseOpts(argv)
     warnings, deprecation_warnings = [], []
 
-    # Set user agent
     if opts.user_agent is not None:
-        std_headers['User-Agent'] = opts.user_agent
-
-    # Set referer
+        opts.headers.setdefault('User-Agent', opts.user_agent)
     if opts.referer is not None:
-        std_headers['Referer'] = opts.referer
-
-    # Custom HTTP headers
-    std_headers.update(opts.headers)
+        opts.headers.setdefault('Referer', opts.referer)
 
     # Dump user agent
     if opts.dump_user_agent:
-        write_string(std_headers['User-Agent'] + '\n', out=sys.stdout)
+        ua = traverse_obj(opts.headers, 'User-Agent', casesense=False, default=std_headers['User-Agent'])
+        write_string(f'{ua}\n', out=sys.stdout)
         sys.exit(0)
 
     # Batch file verification
@@ -767,6 +763,7 @@ def _real_main(argv=None):
         'legacyserverconnect': opts.legacy_server_connect,
         'nocheckcertificate': opts.no_check_certificate,
         'prefer_insecure': opts.prefer_insecure,
+        'http_headers': opts.headers,
         'proxy': opts.proxy,
         'socket_timeout': opts.socket_timeout,
         'bidi_workaround': opts.bidi_workaround,
diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py
index a2cc9f748..3bb786d6a 100644
--- a/yt_dlp/extractor/instagram.py
+++ b/yt_dlp/extractor/instagram.py
@@ -17,7 +17,6 @@ from ..utils import (
     get_element_by_attribute,
     int_or_none,
     lowercase_escape,
-    std_headers,
     str_or_none,
     str_to_int,
     traverse_obj,
@@ -503,7 +502,7 @@ class InstagramPlaylistBaseIE(InstagramBaseIE):
             '%s' % rhx_gis,
             '',
             '%s:%s' % (rhx_gis, csrf_token),
-            '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']),
+            '%s:%s:%s' % (rhx_gis, csrf_token, self.get_param('http_headers')['User-Agent']),
         ]
 
         # try all of the ways to generate a GIS query, and not only use the
diff --git a/yt_dlp/extractor/mildom.py b/yt_dlp/extractor/mildom.py
index ddeaa7021..b5a2e17f2 100644
--- a/yt_dlp/extractor/mildom.py
+++ b/yt_dlp/extractor/mildom.py
@@ -8,7 +8,6 @@ import json
 
 from .common import InfoExtractor
 from ..utils import (
-    std_headers,
     update_url_query,
     random_uuidv4,
     try_get,
@@ -70,7 +69,7 @@ class MildomBaseIE(InfoExtractor):
                     'clu': '',
                     'wh': '1919*810',
                     'rtm': self.iso_timestamp(),
-                    'ua': std_headers['User-Agent'],
+                    'ua': self.get_param('http_headers')['User-Agent'],
                 }).encode('utf8')).decode('utf8').replace('\n', ''),
             }).encode('utf8'))
         self._DISPATCHER_CONFIG = self._parse_json(base64.b64decode(tmp['data']), 'initialization')
diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/extractor/openload.py
index 6ec54509b..36927009d 100644
--- a/yt_dlp/extractor/openload.py
+++ b/yt_dlp/extractor/openload.py
@@ -16,7 +16,6 @@ from ..utils import (
     ExtractorError,
     get_exe_version,
     is_outdated_version,
-    std_headers,
     Popen,
 )
 
@@ -208,7 +207,7 @@ class PhantomJSwrapper(object):
         replaces = self.options
         replaces['url'] = url
 
-        user_agent = headers.get('User-Agent') or std_headers['User-Agent']
+        user_agent = headers.get('User-Agent') or self.get_param('http_headers')['User-Agent']
         replaces['ua'] = user_agent.replace('"', '\\"')
         replaces['jscode'] = jscode
 
diff --git a/yt_dlp/extractor/rtve.py b/yt_dlp/extractor/rtve.py
index af1bb943d..7a1dc6f32 100644
--- a/yt_dlp/extractor/rtve.py
+++ b/yt_dlp/extractor/rtve.py
@@ -17,7 +17,6 @@ from ..utils import (
     qualities,
     remove_end,
     remove_start,
-    std_headers,
     try_get,
 )
 
@@ -71,7 +70,7 @@ class RTVEALaCartaIE(InfoExtractor):
     }]
 
     def _real_initialize(self):
-        user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8')
+        user_agent_b64 = base64.b64encode(self.get_param('http_headers')['User-Agent'].encode('utf-8')).decode('utf-8')
         self._manager = self._download_json(
             'http://www.rtve.es/odin/loki/' + user_agent_b64,
             None, 'Fetching manager info')['manager']
diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py
index 1a9fd00e4..77ffb4bfb 100644
--- a/yt_dlp/extractor/vimeo.py
+++ b/yt_dlp/extractor/vimeo.py
@@ -28,7 +28,6 @@ from ..utils import (
     parse_qs,
     sanitized_Request,
     smuggle_url,
-    std_headers,
     str_or_none,
     try_get,
     unified_timestamp,
@@ -758,7 +757,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
 
     def _real_extract(self, url):
         url, data = unsmuggle_url(url, {})
-        headers = std_headers.copy()
+        headers = self.get_param('http_headers').copy()
         if 'http_headers' in data:
             headers.update(data['http_headers'])
         if 'Referer' not in headers:
diff --git a/yt_dlp/options.py b/yt_dlp/options.py
index 9908f3975..17d8d5da6 100644
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@@ -860,17 +860,16 @@ def create_parser():
     workarounds.add_option(
         '--user-agent',
         metavar='UA', dest='user_agent',
-        help='Specify a custom user agent')
+        help=optparse.SUPPRESS_HELP)
     workarounds.add_option(
         '--referer',
         metavar='URL', dest='referer', default=None,
-        help='Specify a custom referer, use if the video access is restricted to one domain',
-    )
+        help=optparse.SUPPRESS_HELP)
     workarounds.add_option(
         '--add-header',
         metavar='FIELD:VALUE', dest='headers', default={}, type='str',
         action='callback', callback=_dict_from_options_callback,
-        callback_kwargs={'multiple_keys': False, 'process_key': None},
+        callback_kwargs={'multiple_keys': False},
         help='Specify a custom HTTP header and its value, separated by a colon ":". You can use this option multiple times',
     )
     workarounds.add_option(
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index ef2c6bb24..be0c69d8f 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -1372,7 +1372,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
         if url != url_escaped:
             req = update_Request(req, url=url_escaped)
 
-        for h, v in std_headers.items():
+        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
@@ -5436,3 +5436,8 @@ class WebSocketsWrapper():
 
 
 has_websockets = bool(compat_websockets)
+
+
+def merge_headers(*dicts):
+    """Merge dicts of network headers case insensitively, prioritizing the latter ones"""
+    return {k.capitalize(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
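Illustrative note, not part of the patch: a minimal sketch of how the new header precedence composes. `merge_headers` is copied from the utils.py hunk above; the sample header values and the `cli_headers` name are invented for demonstration.

# Sketch only -- merge_headers is taken verbatim from the diff; the dicts below are made up.
import itertools


def merge_headers(*dicts):
    """Merge dicts of network headers case insensitively, prioritizing the latter ones"""
    return {k.capitalize(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}


std_headers = {'User-Agent': 'Mozilla/5.0 (default)', 'Accept-Language': 'en-us,en;q=0.5'}
cli_headers = {'user-agent': 'MyClient/1.0', 'Referer': 'https://example.com/'}  # e.g. collected from --add-header

# Later dicts win, and keys are normalized with str.capitalize(), so 'User-Agent'
# and 'user-agent' collapse into the same key ('User-agent'):
print(merge_headers(std_headers, cli_headers))
# {'User-agent': 'MyClient/1.0', 'Accept-language': 'en-us,en;q=0.5', 'Referer': 'https://example.com/'}

# Embedders get the same precedence by passing the new http_headers parameter
# (merged over std_headers in YoutubeDL.__init__) instead of mutating std_headers:
#     ydl = yt_dlp.YoutubeDL({'http_headers': {'Referer': 'https://example.com/'}})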