[Ted] Rewrite extractor (#2359)

Closes #2343 Authored by: pukkandan, trassshhub
2022-01-20 00:04:20 +08:00 · 2022-01-20 00:04:20 +08:00 · 4259402c56
commit 4259402c56
parent dfb7f2a25d
4 changed files with 194 additions and 314 deletions
--- a/test/test_subtitles.py
+++ b/test/test_subtitles.py
@ -13,7 +13,7 @@ from test.helper import FakeYDL, md5, is_download_test
 from yt_dlp.extractor import (
    YoutubeIE,
    DailymotionIE,
-    TEDIE,
+    TedTalkIE,
    VimeoIE,
    WallaIE,
    CeskaTelevizeIE,
@ -141,7 +141,7 @@ class TestDailymotionSubtitles(BaseTestSubtitles):
@is_download_test
 class TestTedSubtitles(BaseTestSubtitles):
    url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html'
-    IE = TEDIE
+    IE = TedTalkIE
    def test_allsubtitles(self):
        self.DL.params['writesubtitles'] = True
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@ -1522,7 +1522,12 @@ from .teachingchannel import TeachingChannelIE
 from .teamcoco import TeamcocoIE
 from .teamtreehouse import TeamTreeHouseIE
 from .techtalks import TechTalksIE
-from .ted import TEDIE
+from .ted import (
    TedEmbedIE,
    TedPlaylistIE,
    TedSeriesIE,
    TedTalkIE,
 )
 from .tele5 import Tele5IE
 from .tele13 import Tele13IE
 from .telebruxelles import TeleBruxellesIE
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@ -115,6 +115,7 @@ from .channel9 import Channel9IE
 from .vshare import VShareIE
 from .mediasite import MediasiteIE
 from .springboardplatform import SpringboardPlatformIE
 from .ted import TedEmbedIE
 from .yapfiles import YapFilesIE
 from .vice import ViceIE
 from .xfileshare import XFileShareIE
@ -3174,10 +3175,9 @@ class GenericIE(InfoExtractor):
            return self.url_result(mobj.group('url'), 'Tvigle')
        # Look for embedded TED player
-        mobj = re.search(
+        ted_urls = TedEmbedIE._extract_urls(webpage)
-            r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
+        if ted_urls:
-        if mobj is not None:
+            return self.playlist_from_matches(ted_urls, video_id, video_title, ie=TedEmbedIE.ie_key())
            return self.url_result(mobj.group('url'), 'TED')
        # Look for embedded Ustream videos
        ustream_url = UstreamIE._extract_url(webpage)
--- a/yt_dlp/extractor/ted.py
+++ b/yt_dlp/extractor/ted.py
@ -1,274 +1,105 @@
-from __future__ import unicode_literals
+import itertools
 import json
 import re
 from .common import InfoExtractor
 from ..compat import (
    compat_str,
    compat_urlparse
 )
 from ..utils import (
    extract_attributes,
    float_or_none,
    int_or_none,
    str_to_int,
    try_get,
    url_or_none,
    unified_strdate,
    parse_duration,
 )
-class TEDIE(InfoExtractor):
+class TedBaseIE(InfoExtractor):
-    IE_NAME = 'ted'
+    _VALID_URL_BASE = r'https?://www\.ted\.com/(?:{type})(?:/lang/[^/#?]+)?/(?P<id>[\w-]+)'
-    _VALID_URL = r'''(?x)
+
-        (?P<proto>https?://)
+    def _parse_playlist(self, playlist):
-        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
+        for entry in try_get(playlist, lambda x: x['videos']['nodes'], list):
-        (
+            if entry.get('__typename') == 'Video' and entry.get('canonicalUrl'):
-            (?P<type_playlist>playlists(?:/(?P<playlist_id>\d+))?) # We have a playlist
+                yield self.url_result(entry['canonicalUrl'], TedTalkIE.ie_key())
-            |
+
-            ((?P<type_talk>talks)) # We have a simple talk
+
-            |
+class TedTalkIE(TedBaseIE):
-            (?P<type_watch>watch)/[^/]+/[^/]+
+    _VALID_URL = TedBaseIE._VALID_URL_BASE.format(type='talks')
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
-        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
+        'url': 'https://www.ted.com/talks/candace_parker_how_to_break_down_barriers_and_not_accept_limits',
-        'md5': 'b0ce2b05ca215042124fbc9e3886493a',
+        'md5': '47e82c666d9c3261d4fe74748a90aada',
        'info_dict': {
-            'id': '102',
+            'id': '86532',
            'ext': 'mp4',
-            'title': 'The illusion of consciousness',
+            'title': 'How to break down barriers and not accept limits',
-            'description': ('Philosopher Dan Dennett makes a compelling '
+            'description': 'md5:000707cece219d1e165b11550d612331',
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 853,
            'duration': 1308,
            'view_count': int,
-            'comment_count': int,
+            'tags': ['personal growth', 'equality', 'activism', 'motivation', 'social change', 'sports'],
-            'tags': list,
+            'uploader': 'Candace Parker',
            'duration': 676.0,
            'upload_date': '20220114',
            'release_date': '20211201',
            'thumbnail': r're:http.*\.jpg',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # missing HTTP bitrates
        'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
        'info_dict': {
            'id': '6069',
            'ext': 'mp4',
            'title': 'The beauty and power of algorithms',
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'md5:734e352710fb00d840ab87ae31aaf688',
            'uploader': 'Vishal Sikka',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
            'description': 'md5:49a0dbe8fb76d81a0e64b4a80af7f15a'
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'webm',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # no nativeDownloads
        'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
        'info_dict': {
            'id': '1792',
            'ext': 'mp4',
            'title': 'The orchestra in my mouth',
            'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
            'uploader': 'Tom Thum',
            'view_count': int,
            'comment_count': int,
            'tags': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # with own formats and private Youtube external
        'url': 'https://www.ted.com/talks/spencer_wells_a_family_tree_for_humanity',
        'only_matching': True,
    }]
    _NATIVE_FORMATS = {
        'low': {'width': 320, 'height': 180},
        'medium': {'width': 512, 'height': 288},
        'high': {'width': 854, 'height': 480},
    }
    def _extract_info(self, webpage):
        """Return the decoded JSON object handed to the page's ``q("<name>.init", {...})`` call.

        The object literal is located with a regex that requires it to be
        followed by ``)`` and a closing ``</script>`` tag; the captured text
        is then parsed with ``json.loads``.
        """
        # NOTE(review): the "." before "init" is unescaped, so it matches any
        # character - presumably a literal dot was intended; confirm before tightening.
        init_call_re = r'(?s)q\(\s*"\w+.init"\s*,\s*({.+?})\)\s*</script>'
        return json.loads(self._search_regex(init_call_re, webpage, 'info json'))
    def _real_extract(self, url):
-        m = re.match(self._VALID_URL, url, re.VERBOSE)
+        display_id = self._match_id(url)
-        if m.group('type').startswith('embed'):
+        webpage = self._download_webpage(url, display_id)
-            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
+        talk_info = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['videoData']
-            return self.url_result(desktop_url, 'TED')
+        video_id = talk_info['id']
-        name = m.group('name')
+        playerData = self._parse_json(talk_info.get('playerData'), video_id)
        if m.group('type_talk'):
            return self._talk_info(url, name)
        elif m.group('type_watch'):
            return self._watch_info(url, name)
        else:
            return self._playlist_videos_info(url, name)
    def _playlist_videos_info(self, url, name):
        """Return a playlist result with one entry per talk linked from a playlist page.

        url is the playlist page URL and also the base against which entry
        links are resolved; name identifies the download in log output.
        Entries are taken from every tag carrying data-ga-context="playlist".
        """
        webpage = self._download_webpage(url, name,
                                         'Downloading playlist webpage')

        playlist_entries = []
        for entry in re.findall(r'(?s)<[^>]+data-ga-context=["\']playlist["\'][^>]*>', webpage):
            href = extract_attributes(entry).get('href')
            if not href:
                # A tagged element without a link target has nothing to extract
                continue
            playlist_entries.append(self.url_result(
                compat_urlparse.urljoin(url, href), self.ie_key()))

        # The numeric id is optional in _VALID_URL, so it is read from the
        # page's og:url rather than from the URL we were given.
        final_url = self._og_search_url(webpage, fatal=False)
        mobj = re.match(self._VALID_URL, final_url) if final_url else None
        # og:url may be missing or may not be a URL this extractor recognises
        playlist_id = mobj.group('playlist_id') if mobj else None

        return self.playlist_result(
            playlist_entries, playlist_id=playlist_id,
            playlist_title=self._og_search_title(webpage, fatal=False),
            playlist_description=self._og_search_description(webpage))
    def _talk_info(self, url, video_name):
        webpage = self._download_webpage(url, video_name)
        info = self._extract_info(webpage)
        data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info
        talk_info = data['talks'][0]
        title = talk_info['title'].strip()
        downloads = talk_info.get('downloads') or {}
        native_downloads = downloads.get('nativeDownloads') or talk_info.get('nativeDownloads') or {}
        formats = [{
            'url': format_url,
            'format_id': format_id,
        } for (format_id, format_url) in native_downloads.items() if format_url is not None]
        subtitled_downloads = downloads.get('subtitledDownloads') or {}
        for lang, subtitled_download in subtitled_downloads.items():
            for q in self._NATIVE_FORMATS:
                q_url = subtitled_download.get(q)
                if not q_url:
                    continue
                formats.append({
                    'url': q_url,
                    'format_id': '%s-%s' % (q, lang),
                    'language': lang,
                })
        if formats:
            for f in formats:
                finfo = self._NATIVE_FORMATS.get(f['format_id'].split('-')[0])
                if finfo:
                    f.update(finfo)
        player_talk = talk_info['player_talks'][0]
        resources_ = player_talk.get('resources') or talk_info.get('resources')
        http_url = None
-        for format_id, resources in resources_.items():
+        formats, subtitles = [], {}
        for format_id, resources in (playerData.get('resources') or {}).items():
            if format_id == 'hls':
-                if not isinstance(resources, dict):
+                stream_url = url_or_none(try_get(resources, lambda x: x['stream']))
                    continue
                stream_url = url_or_none(resources.get('stream'))
                if not stream_url:
                    continue
-                formats.extend(self._extract_m3u8_formats(
+                m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
-                    stream_url, video_name, 'mp4', m3u8_id=format_id,
+                    stream_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)
-                    fatal=False))
+                formats.extend(m3u8_formats)
-            else:
+                subtitles = self._merge_subtitles(subtitles, m3u8_subs)
-                if not isinstance(resources, list):
+                continue
-                    continue
+
-                if format_id == 'h264':
+            if not isinstance(resources, list):
-                    for resource in resources:
+                continue
-                        h264_url = resource.get('file')
+            if format_id == 'h264':
-                        if not h264_url:
+                for resource in resources:
-                            continue
+                    h264_url = resource.get('file')
-                        bitrate = int_or_none(resource.get('bitrate'))
+                    if not h264_url:
-                        formats.append({
+                        continue
-                            'url': h264_url,
+                    bitrate = int_or_none(resource.get('bitrate'))
-                            'format_id': '%s-%sk' % (format_id, bitrate),
+                    formats.append({
-                            'tbr': bitrate,
+                        'url': h264_url,
-                        })
+                        'format_id': '%s-%sk' % (format_id, bitrate),
-                        if re.search(r'\d+k', h264_url):
+                        'tbr': bitrate,
-                            http_url = h264_url
+                    })
-                elif format_id == 'rtmp':
+                    if re.search(r'\d+k', h264_url):
-                    streamer = talk_info.get('streamer')
+                        http_url = h264_url
-                    if not streamer:
+            elif format_id == 'rtmp':
-                        continue
+                streamer = talk_info.get('streamer')
-                    for resource in resources:
+                if not streamer:
-                        formats.append({
+                    continue
-                            'format_id': '%s-%s' % (format_id, resource.get('name')),
+                formats.extend({
-                            'url': streamer,
+                    'format_id': '%s-%s' % (format_id, resource.get('name')),
-                            'play_path': resource['file'],
+                    'url': streamer,
-                            'ext': 'flv',
+                    'play_path': resource['file'],
-                            'width': int_or_none(resource.get('width')),
+                    'ext': 'flv',
-                            'height': int_or_none(resource.get('height')),
+                    'width': int_or_none(resource.get('width')),
-                            'tbr': int_or_none(resource.get('bitrate')),
+                    'height': int_or_none(resource.get('height')),
-                        })
+                    'tbr': int_or_none(resource.get('bitrate')),
                } for resource in resources if resource.get('file'))
        m3u8_formats = list(filter(
            lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
            formats))
        if http_url:
            m3u8_formats = [f for f in formats if f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none']
            for m3u8_format in m3u8_formats:
                bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
                if not bitrate:
                    continue
                bitrate_url = re.sub(r'\d+k', bitrate, http_url)
                if not self._is_valid_url(
-                        bitrate_url, video_name, '%s bitrate' % bitrate):
+                        bitrate_url, video_id, '%s bitrate' % bitrate):
                    continue
                f = m3u8_format.copy()
                f.update({
@ -289,79 +120,123 @@ class TEDIE(InfoExtractor):
            })
        if not formats:
-            external = player_talk.get('external')
+            external = playerData.get('external') or {}
-            if isinstance(external, dict):
+            service = external.get('service') or ''
-                service = external.get('service')
+            ext_url = external.get('code') if service.lower() == 'youtube' else None
-                if isinstance(service, compat_str):
+            return self.url_result(ext_url or external['uri'])
                    ext_url = None
                    if service.lower() == 'youtube':
                        ext_url = external.get('code')
                    return self.url_result(ext_url or external['uri'])
        self._sort_formats(formats)
-        video_id = compat_str(talk_info['id'])
+        thumbnail = playerData.get('thumb') or self._og_search_property('image', webpage)
        if thumbnail:
            # trim thumbnail resize parameters
            thumbnail = thumbnail.split('?')[0]
        return {
            'id': video_id,
-            'title': title,
+            'title': talk_info.get('title') or self._og_search_title(webpage),
-            'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
+            'uploader': talk_info.get('presenterDisplayName'),
            'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
            'description': self._og_search_description(webpage),
            'subtitles': self._get_subtitles(video_id, talk_info),
            'formats': formats,
            'duration': float_or_none(talk_info.get('duration')),
            'view_count': int_or_none(data.get('viewed_count')),
            'comment_count': int_or_none(
                try_get(data, lambda x: x['comments']['count'])),
            'tags': try_get(talk_info, lambda x: x['tags'], list),
        }
    def _get_subtitles(self, video_id, talk_info):
        """Build the subtitle mapping for a talk.

        The language list is looked up at talk_info['downloads']['languages']
        and then at talk_info['languages']. Each language contributes one
        entry per subtitle format ('ted' and 'srt').

        Returns a dict mapping language code to a list of {'url', 'ext'}
        dicts; the dict is empty when no language list is available.
        """
        languages = try_get(
            talk_info,
            (lambda x: x['downloads']['languages'],
             lambda x: x['languages']), list)
        # try_get gives None when neither lookup yields a list; iterating that would raise
        languages = languages or []

        sub_lang_list = {}
        for language in languages:
            lang_code = language.get('languageCode') or language.get('ianaCode')
            if not lang_code:
                continue
            sub_lang_list[lang_code] = [
                {
                    'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
                    'ext': ext,
                }
                for ext in ['ted', 'srt']
            ]
        return sub_lang_list
    def _watch_info(self, url, name):
        webpage = self._download_webpage(url, name)
        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config', default=None)
        if not config_json:
            embed_url = self._search_regex(
                r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
            return self.url_result(self._proto_relative_url(embed_url))
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')
        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)
        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
-            'description': description,
+            'description': talk_info.get('description') or self._og_search_description(webpage),
            'subtitles': subtitles,
            'formats': formats,
            'duration': talk_info.get('duration') or parse_duration(self._og_search_property('video:duration', webpage)),
            'view_count': str_to_int(talk_info.get('viewedCount')),
            'upload_date': unified_strdate(talk_info.get('publishedAt')),
            'release_date': unified_strdate(talk_info.get('recordedOn')),
            'tags': try_get(playerData, lambda x: x['targeting']['tag'].split(',')),
        }
 class TedSeriesIE(TedBaseIE):
    """Extractor for ted.com series pages, optionally narrowed to one season via ``#season_N``."""

    # Shared base pattern for "series" plus an optional season fragment
    _VALID_URL = TedBaseIE._VALID_URL_BASE.format(type=r"series") + r'(?:#season_(?P<season>\d+))?'
    _TESTS = [{
        'url': 'https://www.ted.com/series/small_thing_big_idea',
        'info_dict': {
            'id': '3',
            'title': 'Small Thing Big Idea',
            'series': 'Small Thing Big Idea',
            'description': 'md5:6869ca52cec661aef72b3e9f7441c55c'
        },
        'playlist_mincount': 16,
    }, {
        'url': 'https://www.ted.com/series/the_way_we_work#season_2',
        'info_dict': {
            'id': '8_2',
            'title': 'The Way We Work Season 2',
            'series': 'The Way We Work',
            'description': 'md5:59469256e533e1a48c4aa926a382234c',
            'season_number': 2
        },
        'playlist_mincount': 8,
    }]

    def _real_extract(self, url):
        """Return a playlist of the series' talks, limited to the season named in the URL if any."""
        display_id, season = self._match_valid_url(url).group('id', 'season')
        webpage = self._download_webpage(url, display_id, 'Downloading series webpage')
        page_props = self._search_nextjs_data(webpage, display_id)['props']['pageProps']

        # Without a season fragment, season is None and every season is kept.
        # NOTE(review): season is the string captured from the URL; the match
        # assumes 'seasonNumber' has the same type in the page data - confirm.
        selected_seasons = (
            season_data for season_data in page_props['seasons']
            if season in (None, season_data.get('seasonNumber')))
        entries = itertools.chain.from_iterable(map(self._parse_playlist, selected_seasons))

        series_id = try_get(page_props, lambda x: x['series']['id'])
        series_name = try_get(page_props, lambda x: x['series']['name']) or self._og_search_title(webpage, fatal=False)

        if season:
            # Season-specific id/title; the id suffix is only added when a series id exists
            playlist_id = f'{series_id}_{season}' if series_id else series_id
            playlist_title = f'{series_name} Season {season}'
        else:
            playlist_id, playlist_title = series_id, series_name

        return self.playlist_result(
            entries, playlist_id, playlist_title,
            self._og_search_description(webpage),
            series=series_name, season_number=int_or_none(season))
 class TedPlaylistIE(TedBaseIE):
    """Extractor for ted.com playlist pages (with or without a numeric id in the path)."""

    _VALID_URL = TedBaseIE._VALID_URL_BASE.format(type=r'playlists(?:/\d+)?')
    _TESTS = [{
        'url': 'https://www.ted.com/playlists/171/the_most_popular_talks_of_all',
        'info_dict': {
            'id': '171',
            'title': 'The most popular talks of all time',
            'description': 'md5:d2f22831dc86c7040e733a3cb3993d78'
        },
        'playlist_mincount': 25,
    }]

    def _real_extract(self, url):
        """Return a playlist result built from the page's embedded Next.js playlist data."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        next_data = self._search_nextjs_data(webpage, display_id)
        playlist = next_data['props']['pageProps']['playlist']

        entries = self._parse_playlist(playlist)
        playlist_id = playlist.get('id')

        title = playlist.get('title')
        if not title:
            # Fall back to the og:title with the site suffix removed; None if that is empty too
            title = self._og_search_title(webpage, default='').replace(' | TED Talks', '') or None

        return self.playlist_result(
            entries, playlist_id, title, self._og_search_description(webpage))
 class TedEmbedIE(InfoExtractor):
    """Extractor for embed.ted.com / embed-ssl.ted.com player URLs; hands them over to TedTalkIE."""

    _VALID_URL = r'https?://embed(?:-ssl)?\.ted\.com/'
    _TESTS = [{
        'url': 'https://embed.ted.com/talks/janet_stovall_how_to_get_serious_about_diversity_and_inclusion_in_the_workplace',
        'info_dict': {
            'id': '21802',
            'ext': 'mp4',
            'title': 'How to get serious about diversity and inclusion in the workplace',
            'description': 'md5:0978aafe396e05341f8ecc795d22189d',
            'view_count': int,
            'tags': list,
            'uploader': 'Janet Stovall',
            'duration': 664.0,
            'upload_date': '20180822',
            'release_date': '20180719',
            'thumbnail': r're:http.*\.jpg',
        },
    }]

    @classmethod
    def _extract_urls(cls, webpage):
        """Return the src URL of every <iframe> in webpage whose src starts with _VALID_URL."""
        return [mobj.group('url') for mobj in re.finditer(
            fr'<iframe[^>]+?src=(["\'])(?P<url>{cls._VALID_URL}.+?)\1', webpage)]

    def _real_extract(self, url):
        """Delegate to TedTalkIE after replacing the embed host prefix with ``www``."""
        # count=1 restricts the rewrite to the leading host; a later "://embed"
        # inside the path or query string must be left untouched.
        talk_url = re.sub(r'://embed(-ssl)?', '://www', url, count=1)
        return self.url_result(talk_url, TedTalkIE.ie_key())