[extractor] Framework for embed detection (#4307)

This commit is contained in:
pukkandan 2022-08-01 06:52:03 +05:30
parent 47304e07dc
commit 8f97a15d1c
8 changed files with 149 additions and 77 deletions

View file

@@ -9,11 +9,13 @@ from ..utils import (
     write_string,
 )
 
+# These bloat the lazy_extractors, so allow them to passthrough silently
+ALLOWED_CLASSMETHODS = {'get_testcases', 'extract_from_webpage'}
+
 
 class LazyLoadMetaClass(type):
     def __getattr__(cls, name):
-        # "_TESTS" bloat the lazy_extractors
-        if '_real_class' not in cls.__dict__ and name != 'get_testcases':
+        if '_real_class' not in cls.__dict__ and name not in ALLOWED_CLASSMETHODS:
             write_string(
                 'WARNING: Falling back to normal extractor since lazy extractor '
                 f'{cls.__name__} does not have attribute {name}{bug_reports_message()}\n')

View file

@@ -11,7 +11,7 @@ import optparse
 from inspect import getsource
 
 NO_ATTR = object()
-STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_WORKING', '_NETRC_MACHINE', 'age_limit']
+STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_VALID_URL', '_WORKING', '_NETRC_MACHINE', 'age_limit']
 CLASS_METHODS = [
     'ie_key', 'working', 'description', 'suitable', '_match_valid_url', '_match_id', 'get_temp_id', 'is_suitable'
 ]
@@ -116,11 +116,6 @@ def build_lazy_ie(ie, name, attr_base):
     }.get(base.__name__, base.__name__) for base in ie.__bases__)
     s = IE_TEMPLATE.format(name=name, module=ie.__module__, bases=bases)
-    valid_url = getattr(ie, '_VALID_URL', None)
-    if not valid_url and hasattr(ie, '_make_valid_url'):
-        valid_url = ie._make_valid_url()
-    if valid_url:
-        s += f'    _VALID_URL = {valid_url!r}\n'
     return s + '\n'.join(extra_ie_code(ie, attr_base))

View file

@@ -1566,7 +1566,8 @@ class YoutubeDL:
         result_type = ie_result.get('_type', 'video')
 
         if result_type in ('url', 'url_transparent'):
-            ie_result['url'] = sanitize_url(ie_result['url'])
+            ie_result['url'] = sanitize_url(
+                ie_result['url'], scheme='http' if self.params.get('prefer_insecure') else 'https')
             if ie_result.get('original_url'):
                 extra_info.setdefault('original_url', ie_result['original_url'])

View file

@@ -402,11 +402,11 @@ class BrightcoveNewIE(AdobePassIE):
 
     @staticmethod
     def _extract_url(ie, webpage):
-        urls = BrightcoveNewIE._extract_urls(ie, webpage)
+        urls = BrightcoveNewIE._extract_brightcove_urls(ie, webpage)
         return urls[0] if urls else None
 
     @staticmethod
-    def _extract_urls(ie, webpage):
+    def _extract_brightcove_urls(ie, webpage):
         # Reference:
         # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
         # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag

View file

@@ -14,6 +14,7 @@ import random
 import re
 import sys
 import time
+import types
 import urllib.parse
 import urllib.request
 import xml.etree.ElementTree
@@ -23,6 +24,7 @@ from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
 from ..downloader import FileDownloader
 from ..downloader.f4m import get_base_url, remove_encrypted_media
 from ..utils import (
+    IDENTITY,
     JSON_LD_RE,
     NO_DEFAULT,
     ExtractorError,
@@ -59,6 +61,7 @@ from ..utils import (
     parse_m3u8_attributes,
     parse_resolution,
     sanitize_filename,
+    sanitize_url,
     sanitized_Request,
     str_or_none,
     str_to_int,
@@ -431,14 +434,26 @@ class InfoExtractor:
     title, description etc.
 
-    Subclasses of this should define a _VALID_URL regexp and, re-define the
-    _real_extract() and (optionally) _real_initialize() methods.
-    Probably, they should also be added to the list of extractors.
+    Subclasses of this should also be added to the list of extractors and
+    should define a _VALID_URL regexp and, re-define the _real_extract() and
+    (optionally) _real_initialize() methods.
 
     Subclasses may also override suitable() if necessary, but ensure the function
     signature is preserved and that this function imports everything it needs
     (except other extractors), so that lazy_extractors works correctly.
 
+    Subclasses can define a list of _EMBED_REGEX, which will be searched for in
+    the HTML of Generic webpages. It may also override _extract_embed_urls
+    or _extract_from_webpage as necessary. While these are normally classmethods,
+    _extract_from_webpage is allowed to be an instance method.
+
+    _extract_from_webpage may raise self.StopExtraction() to stop further
+    processing of the webpage and obtain exclusive rights to it. This is useful
+    when the extractor cannot reliably be matched using just the URL.
+    Eg: invidious/peertube instances
+
+    Embed-only extractors can be defined by setting _VALID_URL = False.
+
     To support username + password (or netrc) login, the extractor must define a
     _NETRC_MACHINE and re-define _perform_login(username, password) and
     (optionally) _initialize_pre_login() methods. The _perform_login method will
@@ -476,6 +491,8 @@ class InfoExtractor:
     _NETRC_MACHINE = None
     IE_DESC = None
     SEARCH_KEY = None
+    _VALID_URL = None
+    _EMBED_REGEX = []
 
     def _login_hint(self, method=NO_DEFAULT, netrc=None):
         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
@@ -499,12 +516,12 @@ class InfoExtractor:
 
     @classmethod
     def _match_valid_url(cls, url):
+        if cls._VALID_URL is False:
+            return None
         # This does not use has/getattr intentionally - we want to know whether
         # we have cached the regexp for *this* class, whereas getattr would also
         # match the superclass
         if '_VALID_URL_RE' not in cls.__dict__:
-            if '_VALID_URL' not in cls.__dict__:
-                cls._VALID_URL = cls._make_valid_url()
             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
         return cls._VALID_URL_RE.match(url)
@@ -1143,10 +1160,12 @@ class InfoExtractor:
             'url': url,
         }
 
-    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
-        urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
-                for m in orderedSet(map(getter, matches) if getter else matches))
-        return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
+    @classmethod
+    def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
+                              getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
+        return cls.playlist_result(
+            (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
+            playlist_id, playlist_title, **kwargs)
 
     @staticmethod
     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
@@ -1353,12 +1372,20 @@ class InfoExtractor:
     def _dc_search_uploader(self, html):
         return self._html_search_meta('dc.creator', html, 'uploader')
 
-    def _rta_search(self, html):
+    @staticmethod
+    def _rta_search(html):
         # See http://www.rtalabel.org/index.php?content=howtofaq#single
         if re.search(r'(?ix)<meta\s+name="rating"\s+'
                      r'     content="RTA-5042-1996-1400-1577-RTA"',
                      html):
             return 18
+
+        # And then there are the jokers who advertise that they use RTA, but actually don't.
+        AGE_LIMIT_MARKERS = [
+            r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
+        ]
+        if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
+            return 18
+
         return 0
 
     def _media_rating_search(self, html):
@@ -1965,14 +1992,9 @@ class InfoExtractor:
                 else 'https:')
 
     def _proto_relative_url(self, url, scheme=None):
-        if url is None:
-            return url
-        if url.startswith('//'):
-            if scheme is None:
-                scheme = self.http_scheme()
-            return scheme + url
-        else:
-            return url
+        scheme = scheme or self.http_scheme()
+        assert scheme.endswith(':')
+        return sanitize_url(url, scheme=scheme[:-1])
 
     def _sleep(self, timeout, video_id, msg_template=None):
         if msg_template is None:
@@ -3767,10 +3789,12 @@ class InfoExtractor:
             headers['Ytdl-request-proxy'] = geo_verification_proxy
         return headers
 
-    def _generic_id(self, url):
+    @staticmethod
+    def _generic_id(url):
         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
 
-    def _generic_title(self, url):
+    @staticmethod
+    def _generic_title(url):
         return urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
 
     @staticmethod
@@ -3816,6 +3840,37 @@ class InfoExtractor:
             self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
         return True
 
+    @classmethod
+    def extract_from_webpage(cls, ydl, url, webpage):
+        ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
+              else ydl.get_info_extractor(cls.ie_key()))
+        yield from ie._extract_from_webpage(url, webpage) or []
+
+    @classmethod
+    def _extract_from_webpage(cls, url, webpage):
+        for embed_url in orderedSet(
+                cls._extract_embed_urls(url, webpage) or [], lazy=True):
+            yield cls.url_result(embed_url, cls)
+
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        """@returns all the embed urls on the webpage"""
+        if '_EMBED_URL_RE' not in cls.__dict__:
+            assert isinstance(cls._EMBED_REGEX, (list, tuple))
+            for idx, regex in enumerate(cls._EMBED_REGEX):
+                assert regex.count('(?P<url>') == 1, \
+                    f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
+            cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
+
+        for regex in cls._EMBED_URL_RE:
+            for mobj in regex.finditer(webpage):
+                embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
+                if cls._VALID_URL is False or cls.suitable(embed_url):
+                    yield embed_url
+
+    class StopExtraction(Exception):
+        pass
+
 
 class SearchInfoExtractor(InfoExtractor):
     """
@@ -3826,8 +3881,8 @@ class SearchInfoExtractor(InfoExtractor):
 
     _MAX_RESULTS = float('inf')
 
-    @classmethod
-    def _make_valid_url(cls):
+    @classproperty
+    def _VALID_URL(cls):
         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 
     def _real_extract(self, query):

View file

@@ -3,6 +3,8 @@ import re
 import urllib.parse
 import xml.etree.ElementTree
 
+from . import gen_extractor_classes
+from .common import InfoExtractor  # isort: split
 from .ant1newsgr import Ant1NewsGrEmbedIE
 from .anvato import AnvatoIE
 from .apa import APAIE
@@ -14,7 +16,6 @@ from .blogger import BloggerIE
 from .brightcove import BrightcoveLegacyIE, BrightcoveNewIE
 from .channel9 import Channel9IE
 from .cloudflarestream import CloudflareStreamIE
-from .common import InfoExtractor
 from .commonprotocols import RtmpIE
 from .condenast import CondeNastIE
 from .dailymail import DailyMailIE
@@ -115,6 +116,7 @@ from ..utils import (
     determine_ext,
     dict_get,
     float_or_none,
+    format_field,
     int_or_none,
     is_html,
     js_to_json,
@@ -2641,8 +2643,15 @@ class GenericIE(InfoExtractor):
         """Report information extraction."""
         self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
 
-    def report_detected(self, name):
-        self._downloader.write_debug(f'Identified a {name}')
+    def report_detected(self, name, num=1, note=None):
+        if num > 1:
+            name += 's'
+        elif not num:
+            return
+        else:
+            num = 'a'
+        self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}')
 
     def _extract_rss(self, url, video_id, doc):
         NS_MAP = {
@@ -2854,8 +2863,7 @@ class GenericIE(InfoExtractor):
         if not self.get_param('test', False) and not is_intentional:
             force = self.get_param('force_generic_extractor', False)
-            self.report_warning(
-                '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
+            self.report_warning('%s generic information extractor' % ('Forcing' if force else 'Falling back on'))
 
         first_bytes = full_response.read(512)
@@ -2933,6 +2941,22 @@ class GenericIE(InfoExtractor):
                     self.report_detected('Camtasia video')
                     return camtasia_res
 
+        info_dict.update({
+            # it's tempting to parse this further, but you would
+            # have to take into account all the variations like
+            #   Video Title - Site Name
+            #   Site Name | Video Title
+            #   Video Title - Tagline | Site Name
+            # and so on and so forth; it's just not practical
+            'title': (self._og_search_title(webpage, default=None)
+                      or self._html_extract_title(webpage, 'video title', default='video')),
+            'description': self._og_search_description(webpage, default=None),
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'age_limit': self._rta_search(webpage),
+        })
+
+        domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
+
         # Sometimes embedded video player is hidden behind percent encoding
         # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
         # Unescaping the whole page allows to handle those cases in a generic way
@@ -2946,40 +2970,12 @@ class GenericIE(InfoExtractor):
             r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>',
             lambda x: unescapeHTML(x.group(0)), webpage)
 
-        # it's tempting to parse this further, but you would
-        # have to take into account all the variations like
-        #   Video Title - Site Name
-        #   Site Name | Video Title
-        #   Video Title - Tagline | Site Name
-        # and so on and so forth; it's just not practical
-        video_title = (self._og_search_title(webpage, default=None)
-                       or self._html_extract_title(webpage, 'video title', default='video'))
-
-        # Try to detect age limit automatically
-        age_limit = self._rta_search(webpage)
-        # And then there are the jokers who advertise that they use RTA,
-        # but actually don't.
-        AGE_LIMIT_MARKERS = [
-            r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
-        ]
-        if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
-            age_limit = 18
-
-        # video uploader is domain name
-        video_uploader = self._search_regex(
-            r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
-
-        video_description = self._og_search_description(webpage, default=None)
-        video_thumbnail = self._og_search_thumbnail(webpage, default=None)
-
-        info_dict.update({
-            'title': video_title,
-            'description': video_description,
-            'thumbnail': video_thumbnail,
-            'age_limit': age_limit,
-        })
-
-        self._downloader.write_debug('Looking for video embeds')
+        # TODO: Remove
+        video_title, video_description, video_thumbnail, age_limit, video_uploader = \
+            info_dict['title'], info_dict['description'], info_dict['thumbnail'], info_dict['age_limit'], domain_name
+
+        # TODO: Move Embeds
+        self._downloader.write_debug('Looking for single embeds')
 
         # Look for Brightcove Legacy Studio embeds
         bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
@@ -2998,7 +2994,7 @@ class GenericIE(InfoExtractor):
         }
 
         # Look for Brightcove New Studio embeds
-        bc_urls = BrightcoveNewIE._extract_urls(self, webpage)
+        bc_urls = BrightcoveNewIE._extract_brightcove_urls(self, webpage)
         if bc_urls:
             return self.playlist_from_matches(
                 bc_urls, video_id, video_title,
@@ -3246,7 +3242,7 @@ class GenericIE(InfoExtractor):
             return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key())
 
         # Look for embedded Spotify player
-        spotify_urls = SpotifyBaseIE._extract_embed_urls(webpage)
+        spotify_urls = SpotifyBaseIE._extract_urls(webpage)
         if spotify_urls:
             return self.playlist_from_matches(spotify_urls, video_id, video_title)
@@ -3837,6 +3833,30 @@ class GenericIE(InfoExtractor):
         tiktok_urls = TikTokIE._extract_urls(webpage)
         if tiktok_urls:
             return self.playlist_from_matches(tiktok_urls, video_id, video_title)
+        # TODO: END: Move Embeds
+
+        self._downloader.write_debug('Looking for embeds')
+        embeds = []
+        for ie in gen_extractor_classes():
+            gen = ie.extract_from_webpage(self._downloader, url, webpage)
+            current_embeds = []
+            try:
+                while True:
+                    current_embeds.append(next(gen))
+            except self.StopExtraction:
+                self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds),
+                                     embeds and 'discarding other embeds')
+                embeds = current_embeds
+                break
+            except StopIteration:
+                self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds))
+                embeds.extend(current_embeds)
+
+        del current_embeds
+        if len(embeds) == 1:
+            return {**info_dict, **embeds[0]}
+        elif embeds:
+            return self.playlist_result(embeds, **info_dict)
+
         # Look for HTML5 media
         entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
@@ -4119,7 +4139,6 @@ class GenericIE(InfoExtractor):
                 entries.append(self.url_result(video_url, 'Youtube'))
                 continue
 
-            # here's a fun little line of code for you:
             video_id = os.path.splitext(video_id)[0]
 
             headers = {
                 'referer': full_response.geturl()

View file

@@ -98,7 +98,7 @@ class SpotifyBaseIE(InfoExtractor):
     }
 
     @classmethod
-    def _extract_embed_urls(cls, webpage):
+    def _extract_urls(cls, webpage):
         return re.findall(
             r'<iframe[^>]+src="(https?://open\.spotify.com/embed/[^"]+)"',
             webpage)

View file

@@ -705,13 +705,13 @@ def sanitize_path(s, force=False):
     return os.path.join(*sanitized_path)
 
 
-def sanitize_url(url):
+def sanitize_url(url, *, scheme='http'):
     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
     # the number of unwanted failures due to missing protocol
     if url is None:
         return
     elif url.startswith('//'):
-        return 'http:%s' % url
+        return f'{scheme}:{url}'
     # Fix some common typos seen so far
     COMMON_TYPOS = (
         # https://github.com/ytdl-org/youtube-dl/issues/15649