From bed30106f544fb3ae995f0e3e73cf39789edeecc Mon Sep 17 00:00:00 2001 From: foghawk Date: Fri, 4 Mar 2022 21:24:49 -0600 Subject: [PATCH] [tumblr] Fix extractor (#2883) Authored by: foghawk --- yt_dlp/extractor/tumblr.py | 350 ++++++++++++++++++++++++++++--------- 1 file changed, 264 insertions(+), 86 deletions(-) diff --git a/yt_dlp/extractor/tumblr.py b/yt_dlp/extractor/tumblr.py index a9ad2e513..a3e0e15f2 100644 --- a/yt_dlp/extractor/tumblr.py +++ b/yt_dlp/extractor/tumblr.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, + traverse_obj, urlencode_postdata ) @@ -14,31 +15,130 @@ class TumblrIE(InfoExtractor): _VALID_URL = r'https?://(?P<blog_name>[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])' _NETRC_MACHINE = 'tumblr' _LOGIN_URL = 'https://www.tumblr.com/login' + _OAUTH_URL = 'https://www.tumblr.com/api/v2/oauth2/token' _TESTS = [{ 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', 'md5': '479bb068e5b16462f5176a6828829767', 'info_dict': { 'id': '54196191430', 'ext': 'mp4', - 'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...', + 'title': 'md5:dfac39636969fe6bf1caa2d50405f069', 'description': 'md5:390ab77358960235b6937ab3b8528956', + 'uploader_id': 'tatianamaslanydaily', + 'uploader_url': 'https://tatianamaslanydaily.tumblr.com/', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 127, + 'like_count': int, + 'repost_count': int, + 'age_limit': 0, + 'tags': ['Orphan Black', 'Tatiana Maslany', 'Interview', 'Video', 'OB S1 DVD Extras'], } }, { + 'note': 'multiple formats', 'url': 'https://maskofthedragon.tumblr.com/post/626907179849564160/mona-talking-in-english', 'md5': 'f43ff8a8861712b6cf0e0c2bd84cfc68', 'info_dict': { 'id': '626907179849564160', 'ext': 'mp4', - 'title': 'Me roast is buggered!, Mona\xa0“talking” in\xa0“english”', + 'title': 'Mona\xa0“talking” in\xa0“english”', 'description': 
'md5:082a3a621530cb786ad2b7592a6d9e2c', + 'uploader_id': 'maskofthedragon', + 'uploader_url': 'https://maskofthedragon.tumblr.com/', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 7, + 'like_count': int, + 'repost_count': int, + 'age_limit': 0, + 'tags': 'count:19', }, 'params': { 'format': 'hd', }, + }, { + 'note': 'non-iframe video (with related posts)', + 'url': 'https://shieldfoss.tumblr.com/post/675519763813908480', + 'md5': '12bdb75661ef443bffe5a4dac1dbf118', + 'info_dict': { + 'id': '675519763813908480', + 'ext': 'mp4', + 'title': 'Shieldfoss', + 'uploader_id': 'nerviovago', + 'uploader_url': 'https://nerviovago.tumblr.com/', + 'thumbnail': r're:^https?://.*\.jpg', + 'like_count': int, + 'repost_count': int, + 'age_limit': 0, + 'tags': [], + } + }, { + 'note': 'dashboard only (original post)', + 'url': 'https://jujanon.tumblr.com/post/159704441298/my-baby-eating', + 'md5': '029f7c91ab386701b211e3d494d2d95e', + 'info_dict': { + 'id': '159704441298', + 'ext': 'mp4', + 'title': 'md5:ba79365861101f4911452728d2950561', + 'description': 'md5:773738196cea76b6996ec71e285bdabc', + 'uploader_id': 'jujanon', + 'uploader_url': 'https://jujanon.tumblr.com/', + 'thumbnail': r're:^https?://.*\.jpg', + 'like_count': int, + 'repost_count': int, + 'age_limit': 0, + 'tags': ['crabs', 'my video', 'my pets'], + } + }, { + 'note': 'dashboard only (reblog)', + 'url': 'https://bartlebyshop.tumblr.com/post/180294460076/duality-of-bird', + 'md5': '04334e7cadb1af680d162912559f51a5', + 'info_dict': { + 'id': '180294460076', + 'ext': 'mp4', + 'title': 'duality of bird', + 'description': 'duality of bird', + 'uploader_id': 'todaysbird', + 'uploader_url': 'https://todaysbird.tumblr.com/', + 'thumbnail': r're:^https?://.*\.jpg', + 'like_count': int, + 'repost_count': int, + 'age_limit': 0, + 'tags': [], + } + }, { + 'note': 'dashboard only (external)', + 'url': 'https://afloweroutofstone.tumblr.com/post/675661759168823296/the-blues-remembers-everything-the-country-forgot', + 
'info_dict': { + 'id': 'q67_fd7b8SU', + 'ext': 'mp4', + 'title': 'The Blues Remembers Everything the Country Forgot', + 'alt_title': 'The Blues Remembers Everything the Country Forgot', + 'description': 'md5:1a6b4097e451216835a24c1023707c79', + 'release_date': '20201224', + 'creator': 'md5:c2239ba15430e87c3b971ba450773272', + 'uploader': 'Moor Mother - Topic', + 'upload_date': '20201223', + 'uploader_id': 'UCxrMtFBRkFvQJ_vVM4il08w', + 'uploader_url': 'http://www.youtube.com/channel/UCxrMtFBRkFvQJ_vVM4il08w', + 'thumbnail': r're:^https?://i.ytimg.com/.*', + 'channel': 'Moor Mother - Topic', + 'channel_id': 'UCxrMtFBRkFvQJ_vVM4il08w', + 'channel_url': 'https://www.youtube.com/channel/UCxrMtFBRkFvQJ_vVM4il08w', + 'channel_follower_count': int, + 'duration': 181, + 'view_count': int, + 'like_count': int, + 'age_limit': 0, + 'categories': ['Music'], + 'tags': 'count:7', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'availability': 'public', + 'track': 'The Blues Remembers Everything the Country Forgot', + 'artist': 'md5:c2239ba15430e87c3b971ba450773272', + 'album': 'Brass', + 'release_year': 2020, + }, + 'add_ie': ['Youtube'], }, { 'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching', 'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab', @@ -54,17 +154,48 @@ class TumblrIE(InfoExtractor): }, # 'add_ie': ['Vidme'], 'skip': 'dead embedded video host' + }, { + 'url': 'https://prozdvoices.tumblr.com/post/673201091169681408/what-recording-voice-acting-sounds-like', + 'md5': 'a0063fc8110e6c9afe44065b4ea68177', + 'info_dict': { + 'id': 'eomhW5MLGWA', + 'ext': 'mp4', + 'title': 'what recording voice acting sounds like', + 'description': 'md5:1da3faa22d0e0b1d8b50216c284ee798', + 'uploader': 'ProZD', + 'upload_date': '20220112', + 'uploader_id': 'ProZD', + 'uploader_url': 'http://www.youtube.com/user/ProZD', + 'thumbnail': r're:^https?://i.ytimg.com/.*', + 'channel': 'ProZD', + 'channel_id': 'UC6MFZAOHXlKK1FI7V0XQVeA', + 'channel_url': 
'https://www.youtube.com/channel/UC6MFZAOHXlKK1FI7V0XQVeA', + 'channel_follower_count': int, + 'duration': 20, + 'view_count': int, + 'like_count': int, + 'age_limit': 0, + 'categories': ['Film & Animation'], + 'tags': [], + 'live_status': 'not_live', + 'playable_in_embed': True, + 'availability': 'public', + }, + 'add_ie': ['Youtube'], }, { 'url': 'https://dominustempori.tumblr.com/post/673572712813297664/youtubes-all-right-for-some-pretty-cool', - 'md5': '5e45724c70b748f64f5a1731ac72c84a', + 'md5': '203e9eb8077e3f45bfaeb4c86c1467b8', 'info_dict': { 'id': '87816359', - 'ext': 'mp4', + 'ext': 'mov', 'title': 'Harold Ramis', + 'description': 'md5:be8e68cbf56ce0785c77f0c6c6dfaf2c', 'uploader': 'Resolution Productions Group', 'uploader_id': 'resolutionproductions', 'uploader_url': 'https://vimeo.com/resolutionproductions', + 'upload_date': '20140227', 'thumbnail': r're:^https?://i.vimeocdn.com/video/.*', + 'timestamp': 1393523719, 'duration': 291, }, 'add_ie': ['Vimeo'], @@ -107,116 +238,163 @@ class TumblrIE(InfoExtractor): 'add_ie': ['Instagram'], }] + _providers = { + 'instagram': 'Instagram', + 'vimeo': 'Vimeo', + 'vine': 'Vine', + 'youtube': 'Youtube', + } + + _ACCESS_TOKEN = None + def _real_initialize(self): + self.get_access_token() self._login() + def get_access_token(self): + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page', fatal=False) + if login_page: + self._ACCESS_TOKEN = self._search_regex( + r'"API_TOKEN":\s*"(\w+)"', login_page, 'API access token', fatal=False) + if not self._ACCESS_TOKEN: + self.report_warning('Failed to get access token; metadata will be missing and some videos may not work') + def _login(self): username, password = self._get_login_info() - if username is None: + if not username: return - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') + if not self._ACCESS_TOKEN: + return - login_form = self._hidden_inputs(login_page) - login_form.update({ - 
'user[email]': username, - 'user[password]': password - }) - - response, urlh = self._download_webpage_handle( - self._LOGIN_URL, None, 'Logging in', - data=urlencode_postdata(login_form), headers={ + self._download_json( + self._OAUTH_URL, None, 'Logging in', + data=urlencode_postdata({ + 'password': password, + 'grant_type': 'password', + 'username': username, + }), headers={ 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': self._LOGIN_URL, - }) - - # Successful login - if '/dashboard' in urlh.geturl(): - return - - login_errors = self._parse_json( - self._search_regex( - r'RegistrationForm\.errors\s*=\s*(\[.+?\])\s*;', response, - 'login errors', default='[]'), - None, fatal=False) - if login_errors: - raise ExtractorError( - 'Unable to login: %s' % login_errors[0], expected=True) - - self.report_warning('Login has probably failed') + 'Authorization': f'Bearer {self._ACCESS_TOKEN}', + }, + errnote='Login failed', fatal=False) def _real_extract(self, url): - m_url = self._match_valid_url(url) - video_id = m_url.group('id') - blog = m_url.group('blog_name') + blog, video_id = self._match_valid_url(url).groups() - url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) + url = f'http://{blog}.tumblr.com/post/{video_id}/' webpage, urlh = self._download_webpage_handle(url, video_id) redirect_url = urlh.geturl() - if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'): - raise ExtractorError( - 'This Tumblr may contain sensitive media. 
' - 'Disable safe mode in your account settings ' - 'at https://www.tumblr.com/settings/account#safe_mode', - expected=True) + api_only = bool(self._search_regex( + r'(tumblr.com|^)/(safe-mode|login_required|blog/view)', + redirect_url, 'redirect', default=None)) + + if api_only and not self._ACCESS_TOKEN: + raise ExtractorError('Cannot get data for dashboard-only post without access token') + + post_json = {} + if self._ACCESS_TOKEN: + post_json = traverse_obj( + self._download_json( + f'https://www.tumblr.com/api/v2/blog/{blog}/posts/{video_id}/permalink', + video_id, headers={'Authorization': f'Bearer {self._ACCESS_TOKEN}'}, fatal=False), + ('response', 'timeline', 'elements', 0)) or {} + content_json = traverse_obj(post_json, ('trail', 0, 'content'), ('content')) or [] + video_json = next( + (item for item in content_json if item.get('type') == 'video'), {}) + media_json = video_json.get('media') or {} + if api_only and not media_json.get('url') and not video_json.get('url'): + raise ExtractorError('Failed to find video data for dashboard-only post') + + if not media_json.get('url') and video_json.get('url'): + # external video host + return self.url_result( + video_json['url'], + self._providers.get(video_json.get('provider'), 'Generic')) + + video_url = self._og_search_video_url(webpage, default=None) + duration = None + formats = [] + + # iframes can supply duration and sometimes additional formats, so check for one iframe_url = self._search_regex( - r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'', + fr'src=\'(https?://www\.tumblr\.com/video/{blog}/{video_id}/[^\']+)\'', webpage, 'iframe url', default=None) - if iframe_url is None: + if iframe_url: + iframe = self._download_webpage( + iframe_url, video_id, 'Downloading iframe page', + headers={'Referer': redirect_url}) + + options = self._parse_json( + self._search_regex( + r'data-crt-options=(["\'])(?P<options>.+?)\1', iframe, + 'hd video url', default='', group='options'), + video_id, fatal=False) + if options: 
+ duration = int_or_none(options.get('duration')) + + hd_url = options.get('hdUrl') + if hd_url: + # there are multiple formats; extract them + # ignore other sources of width/height data as they may be wrong + sources = [] + sd_url = self._search_regex( + r'<source[^>]+src=(["\'])(?P<url>.+?)\1', iframe, + 'sd video url', default=None, group='url') + if sd_url: + sources.append((sd_url, 'sd')) + sources.append((hd_url, 'hd')) + + formats = [{ + 'url': video_url, + 'format_id': format_id, + 'height': int_or_none(self._search_regex( + r'_(\d+)\.\w+$', video_url, 'height', default=None)), + 'quality': quality, + } for quality, (video_url, format_id) in enumerate(sources)] + + if not media_json.get('url') and not video_url and not iframe_url: + # external video host (but we weren't able to figure it out from the api) iframe_url = self._search_regex( r'src=["\'](https?://safe\.txmblr\.com/svc/embed/inline/[^"\']+)["\']', webpage, 'embed iframe url', default=None) return self.url_result(iframe_url or redirect_url, 'Generic') - iframe = self._download_webpage( - iframe_url, video_id, 'Downloading iframe page', - headers={'Referer': redirect_url}) - - duration = None - sources = [] - - sd_url = self._search_regex( - r'<source[^>]+src=(["\'])(?P<url>.+?)\1', iframe, - 'sd video url', default=None, group='url') - if sd_url: - sources.append((sd_url, 'sd')) - - options = self._parse_json( - self._search_regex( - r'data-crt-options=(["\'])(?P<options>.+?)\1', iframe, - 'hd video url', default='', group='options'), - video_id, fatal=False) - if options: - duration = int_or_none(options.get('duration')) - hd_url = options.get('hdUrl') - if hd_url: - sources.append((hd_url, 'hd')) - - formats = [{ - 'url': video_url, - 'ext': 'mp4', - 'format_id': format_id, - 'height': int_or_none(self._search_regex( - r'/(\d{3,4})$', video_url, 'height', default=None)), - 'quality': quality, - } for quality, (video_url, format_id) in enumerate(sources)] - + formats = formats or [{ + 'url': media_json.get('url') or video_url, + 
'width': int_or_none( + media_json.get('width') or self._og_search_property('video:width', webpage, default=None)), + 'height': int_or_none( + media_json.get('height') or self._og_search_property('video:height', webpage, default=None)), + }] self._sort_formats(formats) - # The only place where you can get a title, it's not complete, - # but searching in other places doesn't work for all videos - video_title = self._html_search_regex( - r'(?s)<title>(?P<title>.*?)(?: \| Tumblr)?</title>', - webpage, 'title') + # the url we're extracting from might be an original post or it might be a reblog. + # if it's a reblog, og:description will be the reblogger's comment, not the uploader's. + # content_json is always the op, so if it exists but has no text, there's no description + if content_json: + description = '\n\n'.join(( + item.get('text') for item in content_json if item.get('type') == 'text')) or None + else: + description = self._og_search_description(webpage, default=None) + uploader_id = traverse_obj(post_json, 'reblogged_root_name', 'blog_name') return { 'id': video_id, - 'title': video_title, - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'title': post_json.get('summary') or (blog if api_only else self._html_search_regex( + r'(?s)<title>(?P<title>.*?)(?: \| Tumblr)?</title>', webpage, 'title')), + 'description': description, + 'thumbnail': (traverse_obj(video_json, ('poster', 0, 'url')) + or self._og_search_thumbnail(webpage, default=None)), + 'uploader_id': uploader_id, + 'uploader_url': f'https://{uploader_id}.tumblr.com/' if uploader_id else None, 'duration': duration, + 'like_count': post_json.get('like_count'), + 'repost_count': post_json.get('reblog_count'), + 'age_limit': {True: 18, False: 0}.get(post_json.get('is_nsfw')), + 'tags': post_json.get('tags'), 'formats': formats, }