[Ted] Rewrite extractor (#2359)

Closes #2343
Authored by: pukkandan, trassshhub
This commit is contained in:
trasssh 2022-01-20 00:04:20 +08:00 committed by GitHub
parent dfb7f2a25d
commit 4259402c56
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 194 additions and 314 deletions

View file

@ -13,7 +13,7 @@ from test.helper import FakeYDL, md5, is_download_test
from yt_dlp.extractor import (
YoutubeIE,
DailymotionIE,
TEDIE,
TedTalkIE,
VimeoIE,
WallaIE,
CeskaTelevizeIE,
@ -141,7 +141,7 @@ class TestDailymotionSubtitles(BaseTestSubtitles):
@is_download_test
class TestTedSubtitles(BaseTestSubtitles):
url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html'
IE = TEDIE
IE = TedTalkIE
def test_allsubtitles(self):
self.DL.params['writesubtitles'] = True

View file

@ -1522,7 +1522,12 @@ from .teachingchannel import TeachingChannelIE
from .teamcoco import TeamcocoIE
from .teamtreehouse import TeamTreeHouseIE
from .techtalks import TechTalksIE
from .ted import TEDIE
from .ted import (
TedEmbedIE,
TedPlaylistIE,
TedSeriesIE,
TedTalkIE,
)
from .tele5 import Tele5IE
from .tele13 import Tele13IE
from .telebruxelles import TeleBruxellesIE

View file

@ -115,6 +115,7 @@ from .channel9 import Channel9IE
from .vshare import VShareIE
from .mediasite import MediasiteIE
from .springboardplatform import SpringboardPlatformIE
from .ted import TedEmbedIE
from .yapfiles import YapFilesIE
from .vice import ViceIE
from .xfileshare import XFileShareIE
@ -3174,10 +3175,9 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'), 'Tvigle')
# Look for embedded TED player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'TED')
ted_urls = TedEmbedIE._extract_urls(webpage)
if ted_urls:
return self.playlist_from_matches(ted_urls, video_id, video_title, ie=TedEmbedIE.ie_key())
# Look for embedded Ustream videos
ustream_url = UstreamIE._extract_url(webpage)

View file

@ -1,233 +1,67 @@
from __future__ import unicode_literals
import json
import itertools
import re
from .common import InfoExtractor
from ..compat import (
compat_str,
compat_urlparse
)
from ..utils import (
extract_attributes,
float_or_none,
int_or_none,
str_to_int,
try_get,
url_or_none,
unified_strdate,
parse_duration,
)
class TEDIE(InfoExtractor):
IE_NAME = 'ted'
_VALID_URL = r'''(?x)
(?P<proto>https?://)
(?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
(
(?P<type_playlist>playlists(?:/(?P<playlist_id>\d+))?) # We have a playlist
|
((?P<type_talk>talks)) # We have a simple talk
|
(?P<type_watch>watch)/[^/]+/[^/]+
)
(/lang/(.*?))? # The url may contain the language
/(?P<name>[\w-]+) # Here goes the name and then ".html"
.*)$
'''
# Common base for the ted.com extractors: holds the shared URL template and
# the helper that turns a playlist/season object into talk entries.
class TedBaseIE(InfoExtractor):
    # Template consumed by subclasses through ``str.format(type=...)``: the
    # ``{type}`` placeholder receives the site section (e.g. ``talks``), an
    # optional ``/lang/<code>`` segment is skipped, and the last path component
    # is captured as the ``id`` group.
    _VALID_URL_BASE = r'https?://www\.ted\.com/(?:{type})(?:/lang/[^/#?]+)?/(?P<id>[\w-]+)'

    def _parse_playlist(self, playlist):
        """Yield one ``url_result`` per video node found in *playlist*.

        Only nodes whose ``__typename`` is ``'Video'`` and that carry a
        non-empty ``canonicalUrl`` are emitted; each is delegated to
        ``TedTalkIE``.  When ``playlist['videos']['nodes']`` is missing or is
        not a list, nothing is yielded.
        """
        # try_get gives back None when the lookup fails or the value is not a
        # list; fall back to an empty list so iteration does not raise
        # "TypeError: 'NoneType' object is not iterable".
        nodes = try_get(playlist, lambda x: x['videos']['nodes'], list) or []
        for entry in nodes:
            if entry.get('__typename') == 'Video' and entry.get('canonicalUrl'):
                yield self.url_result(entry['canonicalUrl'], TedTalkIE.ie_key())
class TedTalkIE(TedBaseIE):
_VALID_URL = TedBaseIE._VALID_URL_BASE.format(type='talks')
_TESTS = [{
'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
'md5': 'b0ce2b05ca215042124fbc9e3886493a',
'url': 'https://www.ted.com/talks/candace_parker_how_to_break_down_barriers_and_not_accept_limits',
'md5': '47e82c666d9c3261d4fe74748a90aada',
'info_dict': {
'id': '102',
'id': '86532',
'ext': 'mp4',
'title': 'The illusion of consciousness',
'description': ('Philosopher Dan Dennett makes a compelling '
'argument that not only don\'t we understand our own '
'consciousness, but that half the time our brains are '
'actively fooling us.'),
'uploader': 'Dan Dennett',
'width': 853,
'duration': 1308,
'title': 'How to break down barriers and not accept limits',
'description': 'md5:000707cece219d1e165b11550d612331',
'view_count': int,
'comment_count': int,
'tags': list,
'tags': ['personal growth', 'equality', 'activism', 'motivation', 'social change', 'sports'],
'uploader': 'Candace Parker',
'duration': 676.0,
'upload_date': '20220114',
'release_date': '20211201',
'thumbnail': r're:http.*\.jpg',
},
'params': {
'skip_download': True,
},
}, {
# missing HTTP bitrates
'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
'info_dict': {
'id': '6069',
'ext': 'mp4',
'title': 'The beauty and power of algorithms',
'thumbnail': r're:^https?://.+\.jpg',
'description': 'md5:734e352710fb00d840ab87ae31aaf688',
'uploader': 'Vishal Sikka',
},
'params': {
'skip_download': True,
},
}, {
'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
'info_dict': {
'id': '1972',
'ext': 'mp4',
'title': 'Be passionate. Be courageous. Be your best.',
'uploader': 'Gabby Giffords and Mark Kelly',
'description': 'md5:5174aed4d0f16021b704120360f72b92',
'duration': 1128,
},
'params': {
'skip_download': True,
},
}, {
'url': 'http://www.ted.com/playlists/who_are_the_hackers',
'info_dict': {
'id': '10',
'title': 'Who are the hackers?',
'description': 'md5:49a0dbe8fb76d81a0e64b4a80af7f15a'
},
'playlist_mincount': 6,
}, {
# contains a youtube video
'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
'add_ie': ['Youtube'],
'info_dict': {
'id': '_ZG8HBuDjgc',
'ext': 'webm',
'title': 'Douglas Adams: Parrots the Universe and Everything',
'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
'uploader': 'University of California Television (UCTV)',
'uploader_id': 'UCtelevision',
'upload_date': '20080522',
},
'params': {
'skip_download': True,
},
}, {
# no nativeDownloads
'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
'info_dict': {
'id': '1792',
'ext': 'mp4',
'title': 'The orchestra in my mouth',
'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
'uploader': 'Tom Thum',
'view_count': int,
'comment_count': int,
'tags': list,
},
'params': {
'skip_download': True,
},
}, {
# with own formats and private Youtube external
'url': 'https://www.ted.com/talks/spencer_wells_a_family_tree_for_humanity',
'only_matching': True,
}]
_NATIVE_FORMATS = {
'low': {'width': 320, 'height': 180},
'medium': {'width': 512, 'height': 288},
'high': {'width': 854, 'height': 480},
}
def _extract_info(self, webpage):
info_json = self._search_regex(
r'(?s)q\(\s*"\w+.init"\s*,\s*({.+?})\)\s*</script>',
webpage, 'info json')
return json.loads(info_json)
def _real_extract(self, url):
m = re.match(self._VALID_URL, url, re.VERBOSE)
if m.group('type').startswith('embed'):
desktop_url = m.group('proto') + 'www' + m.group('urlmain')
return self.url_result(desktop_url, 'TED')
name = m.group('name')
if m.group('type_talk'):
return self._talk_info(url, name)
elif m.group('type_watch'):
return self._watch_info(url, name)
else:
return self._playlist_videos_info(url, name)
def _playlist_videos_info(self, url, name):
'''Returns the videos of the playlist'''
webpage = self._download_webpage(url, name,
'Downloading playlist webpage')
playlist_entries = []
for entry in re.findall(r'(?s)<[^>]+data-ga-context=["\']playlist["\'][^>]*>', webpage):
attrs = extract_attributes(entry)
entry_url = compat_urlparse.urljoin(url, attrs['href'])
playlist_entries.append(self.url_result(entry_url, self.ie_key()))
final_url = self._og_search_url(webpage, fatal=False)
playlist_id = (
re.match(self._VALID_URL, final_url).group('playlist_id')
if final_url else None)
return self.playlist_result(
playlist_entries, playlist_id=playlist_id,
playlist_title=self._og_search_title(webpage, fatal=False),
playlist_description=self._og_search_description(webpage))
def _talk_info(self, url, video_name):
webpage = self._download_webpage(url, video_name)
info = self._extract_info(webpage)
data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info
talk_info = data['talks'][0]
title = talk_info['title'].strip()
downloads = talk_info.get('downloads') or {}
native_downloads = downloads.get('nativeDownloads') or talk_info.get('nativeDownloads') or {}
formats = [{
'url': format_url,
'format_id': format_id,
} for (format_id, format_url) in native_downloads.items() if format_url is not None]
subtitled_downloads = downloads.get('subtitledDownloads') or {}
for lang, subtitled_download in subtitled_downloads.items():
for q in self._NATIVE_FORMATS:
q_url = subtitled_download.get(q)
if not q_url:
continue
formats.append({
'url': q_url,
'format_id': '%s-%s' % (q, lang),
'language': lang,
})
if formats:
for f in formats:
finfo = self._NATIVE_FORMATS.get(f['format_id'].split('-')[0])
if finfo:
f.update(finfo)
player_talk = talk_info['player_talks'][0]
resources_ = player_talk.get('resources') or talk_info.get('resources')
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
talk_info = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['videoData']
video_id = talk_info['id']
playerData = self._parse_json(talk_info.get('playerData'), video_id)
http_url = None
for format_id, resources in resources_.items():
formats, subtitles = [], {}
for format_id, resources in (playerData.get('resources') or {}).items():
if format_id == 'hls':
if not isinstance(resources, dict):
continue
stream_url = url_or_none(resources.get('stream'))
stream_url = url_or_none(try_get(resources, lambda x: x['stream']))
if not stream_url:
continue
formats.extend(self._extract_m3u8_formats(
stream_url, video_name, 'mp4', m3u8_id=format_id,
fatal=False))
else:
m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
stream_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)
formats.extend(m3u8_formats)
subtitles = self._merge_subtitles(subtitles, m3u8_subs)
continue
if not isinstance(resources, list):
continue
if format_id == 'h264':
@ -247,8 +81,7 @@ class TEDIE(InfoExtractor):
streamer = talk_info.get('streamer')
if not streamer:
continue
for resource in resources:
formats.append({
formats.extend({
'format_id': '%s-%s' % (format_id, resource.get('name')),
'url': streamer,
'play_path': resource['file'],
@ -256,19 +89,17 @@ class TEDIE(InfoExtractor):
'width': int_or_none(resource.get('width')),
'height': int_or_none(resource.get('height')),
'tbr': int_or_none(resource.get('bitrate')),
})
} for resource in resources if resource.get('file'))
m3u8_formats = list(filter(
lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
formats))
if http_url:
m3u8_formats = [f for f in formats if f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none']
for m3u8_format in m3u8_formats:
bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
if not bitrate:
continue
bitrate_url = re.sub(r'\d+k', bitrate, http_url)
if not self._is_valid_url(
bitrate_url, video_name, '%s bitrate' % bitrate):
bitrate_url, video_id, '%s bitrate' % bitrate):
continue
f = m3u8_format.copy()
f.update({
@ -289,79 +120,123 @@ class TEDIE(InfoExtractor):
})
if not formats:
external = player_talk.get('external')
if isinstance(external, dict):
service = external.get('service')
if isinstance(service, compat_str):
ext_url = None
if service.lower() == 'youtube':
ext_url = external.get('code')
external = playerData.get('external') or {}
service = external.get('service') or ''
ext_url = external.get('code') if service.lower() == 'youtube' else None
return self.url_result(ext_url or external['uri'])
self._sort_formats(formats)
video_id = compat_str(talk_info['id'])
thumbnail = playerData.get('thumb') or self._og_search_property('image', webpage)
if thumbnail:
# trim thumbnail resize parameters
thumbnail = thumbnail.split('?')[0]
return {
'id': video_id,
'title': title,
'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
'description': self._og_search_description(webpage),
'subtitles': self._get_subtitles(video_id, talk_info),
'formats': formats,
'duration': float_or_none(talk_info.get('duration')),
'view_count': int_or_none(data.get('viewed_count')),
'comment_count': int_or_none(
try_get(data, lambda x: x['comments']['count'])),
'tags': try_get(talk_info, lambda x: x['tags'], list),
}
def _get_subtitles(self, video_id, talk_info):
sub_lang_list = {}
for language in try_get(
talk_info,
(lambda x: x['downloads']['languages'],
lambda x: x['languages']), list):
lang_code = language.get('languageCode') or language.get('ianaCode')
if not lang_code:
continue
sub_lang_list[lang_code] = [
{
'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
'ext': ext,
}
for ext in ['ted', 'srt']
]
return sub_lang_list
def _watch_info(self, url, name):
webpage = self._download_webpage(url, name)
config_json = self._html_search_regex(
r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
webpage, 'config', default=None)
if not config_json:
embed_url = self._search_regex(
r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
return self.url_result(self._proto_relative_url(embed_url))
config = json.loads(config_json)['config']
video_url = config['video']['url']
thumbnail = config.get('image', {}).get('url')
title = self._html_search_regex(
r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
description = self._html_search_regex(
[
r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
],
webpage, 'description', fatal=False)
return {
'id': name,
'url': video_url,
'title': title,
'title': talk_info.get('title') or self._og_search_title(webpage),
'uploader': talk_info.get('presenterDisplayName'),
'thumbnail': thumbnail,
'description': description,
'description': talk_info.get('description') or self._og_search_description(webpage),
'subtitles': subtitles,
'formats': formats,
'duration': talk_info.get('duration') or parse_duration(self._og_search_property('video:duration', webpage)),
'view_count': str_to_int(talk_info.get('viewedCount')),
'upload_date': unified_strdate(talk_info.get('publishedAt')),
'release_date': unified_strdate(talk_info.get('recordedOn')),
'tags': try_get(playerData, lambda x: x['targeting']['tag'].split(',')),
}
# Extractor for ted.com series pages; an optional ``#season_<n>`` URL fragment
# restricts the resulting playlist to that single season.
class TedSeriesIE(TedBaseIE):
    _VALID_URL = TedBaseIE._VALID_URL_BASE.format(type='series') + r'(?:#season_(?P<season>\d+))?'
    _TESTS = [{
        'url': 'https://www.ted.com/series/small_thing_big_idea',
        'info_dict': {
            'id': '3',
            'title': 'Small Thing Big Idea',
            'series': 'Small Thing Big Idea',
            'description': 'md5:6869ca52cec661aef72b3e9f7441c55c'
        },
        'playlist_mincount': 16,
    }, {
        'url': 'https://www.ted.com/series/the_way_we_work#season_2',
        'info_dict': {
            'id': '8_2',
            'title': 'The Way We Work Season 2',
            'series': 'The Way We Work',
            'description': 'md5:59469256e533e1a48c4aa926a382234c',
            'season_number': 2
        },
        'playlist_mincount': 8,
    }]

    def _real_extract(self, url):
        """Build a playlist from the series page's embedded Next.js data.

        The id and title get a season suffix when the URL names a season;
        otherwise every season of the series contributes its videos.
        """
        mobj = self._match_valid_url(url)
        display_id = mobj.group('id')
        season = mobj.group('season')

        webpage = self._download_webpage(url, display_id, 'Downloading series webpage')
        page_props = self._search_nextjs_data(webpage, display_id)['props']['pageProps']

        # Looked up eagerly so that a missing 'seasons' key fails right here
        # rather than when the lazy entries are first consumed.
        seasons = page_props['seasons']

        def _wanted_entries():
            # No season requested -> take all; otherwise keep only the season
            # whose 'seasonNumber' equals the value taken from the URL.
            for season_data in seasons:
                if season is None or season == season_data.get('seasonNumber'):
                    yield from self._parse_playlist(season_data)

        series_id = try_get(page_props, lambda x: x['series']['id'])
        series_name = try_get(page_props, lambda x: x['series']['name'])
        if not series_name:
            series_name = self._og_search_title(webpage, fatal=False)

        if season and series_id:
            playlist_id = f'{series_id}_{season}'
        else:
            playlist_id = series_id

        if season:
            playlist_title = f'{series_name} Season {season}'
        else:
            playlist_title = series_name

        return self.playlist_result(
            _wanted_entries(),
            playlist_id=playlist_id,
            playlist_title=playlist_title,
            playlist_description=self._og_search_description(webpage),
            series=series_name, season_number=int_or_none(season))
# Extractor for ted.com playlist pages (``/playlists/<slug>`` with an optional
# numeric component before the slug).
class TedPlaylistIE(TedBaseIE):
    _VALID_URL = TedBaseIE._VALID_URL_BASE.format(type=r'playlists(?:/\d+)?')
    _TESTS = [{
        'url': 'https://www.ted.com/playlists/171/the_most_popular_talks_of_all',
        'info_dict': {
            'id': '171',
            'title': 'The most popular talks of all time',
            'description': 'md5:d2f22831dc86c7040e733a3cb3993d78'
        },
        'playlist_mincount': 25,
    }]

    def _real_extract(self, url):
        """Return the playlist described by the page's embedded Next.js data.

        The title comes from the playlist object when present, otherwise from
        the OpenGraph title with the `` | TED Talks`` suffix removed (``None``
        if that is empty too).
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        next_data = self._search_nextjs_data(webpage, display_id)
        playlist = next_data['props']['pageProps']['playlist']

        entries = self._parse_playlist(playlist)
        playlist_id = playlist.get('id')

        title = playlist.get('title')
        if not title:
            og_title = self._og_search_title(webpage, default='')
            title = og_title.replace(' | TED Talks', '') or None

        description = self._og_search_description(webpage)
        return self.playlist_result(
            entries, playlist_id=playlist_id,
            playlist_title=title, playlist_description=description)
# Extractor for embed.ted.com / embed-ssl.ted.com player URLs; it only rewrites
# the URL to the www host and hands it over to TedTalkIE.
class TedEmbedIE(InfoExtractor):
    _VALID_URL = r'https?://embed(?:-ssl)?\.ted\.com/'

    _TESTS = [{
        'url': 'https://embed.ted.com/talks/janet_stovall_how_to_get_serious_about_diversity_and_inclusion_in_the_workplace',
        'info_dict': {
            'id': '21802',
            'ext': 'mp4',
            'title': 'How to get serious about diversity and inclusion in the workplace',
            'description': 'md5:0978aafe396e05341f8ecc795d22189d',
            'view_count': int,
            'tags': list,
            'uploader': 'Janet Stovall',
            'duration': 664.0,
            'upload_date': '20180822',
            'release_date': '20180719',
            'thumbnail': r're:http.*\.jpg',
        },
    }]

    @classmethod
    def _extract_urls(cls, webpage):
        """Return the ``src`` URLs of all TED embed iframes found in *webpage*."""
        # The opening quote is captured as group 1 and back-referenced so the
        # lazily matched URL stops at the matching closing quote.
        iframe_re = fr'<iframe[^>]+?src=(["\'])(?P<url>{cls._VALID_URL}.+?)\1'
        found = []
        for match in re.finditer(iframe_re, webpage):
            found.append(match.group('url'))
        return found

    def _real_extract(self, url):
        """Redirect the embed URL to its www.ted.com form for TedTalkIE."""
        desktop_url = re.sub(r'://embed(-ssl)?', '://www', url)
        return self.url_result(desktop_url, TedTalkIE.ie_key())