[youtube] More metadata extraction for channels/playlists
This commit is contained in:
parent
18590cecdb
commit
b60419c51a
2 changed files with 53 additions and 24 deletions
|
@ -336,9 +336,8 @@ class InfoExtractor(object):
|
||||||
There must be a key "entries", which is a list, an iterable, or a PagedList
|
There must be a key "entries", which is a list, an iterable, or a PagedList
|
||||||
object, each element of which is a valid dictionary by this specification.
|
object, each element of which is a valid dictionary by this specification.
|
||||||
|
|
||||||
Additionally, playlists can have "id", "title", "description", "uploader",
|
Additionally, playlists can have "id", "title", and any other relevant
|
||||||
"uploader_id", "uploader_url", "duration" attributes with the same semantics
|
attributes with the same semantics as videos (see above).
|
||||||
as videos (see above).
|
|
||||||
|
|
||||||
|
|
||||||
_type "multi_video" indicates that there are multiple videos that
|
_type "multi_video" indicates that there are multiple videos that
|
||||||
|
@ -967,10 +966,11 @@ class InfoExtractor(object):
|
||||||
urls, playlist_id=playlist_id, playlist_title=playlist_title)
|
urls, playlist_id=playlist_id, playlist_title=playlist_title)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
|
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
|
||||||
"""Returns a playlist"""
|
"""Returns a playlist"""
|
||||||
video_info = {'_type': 'playlist',
|
video_info = {'_type': 'playlist',
|
||||||
'entries': entries}
|
'entries': entries}
|
||||||
|
video_info.update(kwargs)
|
||||||
if playlist_id:
|
if playlist_id:
|
||||||
video_info['id'] = playlist_id
|
video_info['id'] = playlist_id
|
||||||
if playlist_title:
|
if playlist_title:
|
||||||
|
|
|
@ -31,6 +31,7 @@ from ..utils import (
|
||||||
clean_html,
|
clean_html,
|
||||||
error_to_compat_str,
|
error_to_compat_str,
|
||||||
ExtractorError,
|
ExtractorError,
|
||||||
|
format_field,
|
||||||
float_or_none,
|
float_or_none,
|
||||||
get_element_by_id,
|
get_element_by_id,
|
||||||
int_or_none,
|
int_or_none,
|
||||||
|
@ -2675,6 +2676,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
'uploader': video_uploader,
|
'uploader': video_uploader,
|
||||||
'uploader_id': video_uploader_id,
|
'uploader_id': video_uploader_id,
|
||||||
'uploader_url': video_uploader_url,
|
'uploader_url': video_uploader_url,
|
||||||
|
'channel': video_uploader,
|
||||||
'channel_id': channel_id,
|
'channel_id': channel_id,
|
||||||
'channel_url': channel_url,
|
'channel_url': channel_url,
|
||||||
'upload_date': upload_date,
|
'upload_date': upload_date,
|
||||||
|
@ -3402,44 +3404,71 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
|
||||||
uploader['uploader_url'] = urljoin(
|
uploader['uploader_url'] = urljoin(
|
||||||
'https://www.youtube.com/',
|
'https://www.youtube.com/',
|
||||||
try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
|
try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
|
||||||
return uploader
|
return {k:v for k, v in uploader.items() if v is not None}
|
||||||
|
|
||||||
def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
|
def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
|
||||||
|
playlist_id = title = description = channel_url = channel_name = channel_id = None
|
||||||
|
thumbnails_list = tags = []
|
||||||
|
|
||||||
selected_tab = self._extract_selected_tab(tabs)
|
selected_tab = self._extract_selected_tab(tabs)
|
||||||
renderer = try_get(
|
renderer = try_get(
|
||||||
data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
|
data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
|
||||||
playlist_id = title = description = None
|
|
||||||
if renderer:
|
if renderer:
|
||||||
channel_title = renderer.get('title') or item_id
|
channel_name = renderer.get('title')
|
||||||
tab_title = selected_tab.get('title')
|
channel_url = renderer.get('channelUrl')
|
||||||
title = channel_title or item_id
|
channel_id = renderer.get('externalId')
|
||||||
if tab_title:
|
|
||||||
title += ' - %s' % tab_title
|
|
||||||
description = renderer.get('description')
|
|
||||||
playlist_id = renderer.get('externalId')
|
|
||||||
|
|
||||||
# this has thumbnails, but there is currently no thumbnail field for playlists
|
|
||||||
# sidebar.playlistSidebarRenderer has even more data, but its structure is more complex
|
|
||||||
renderer = try_get(
|
|
||||||
data, lambda x: x['microformat']['microformatDataRenderer'], dict)
|
|
||||||
if not renderer:
|
if not renderer:
|
||||||
renderer = try_get(
|
renderer = try_get(
|
||||||
data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
|
data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
|
||||||
if renderer:
|
if renderer:
|
||||||
title = renderer.get('title')
|
title = renderer.get('title')
|
||||||
description = renderer.get('description')
|
description = renderer.get('description')
|
||||||
playlist_id = item_id
|
playlist_id = channel_id
|
||||||
|
tags = renderer.get('keywords', '').split()
|
||||||
|
thumbnails_list = (
|
||||||
|
try_get(renderer, lambda x: x['avatar']['thumbnails'], list)
|
||||||
|
or data['sidebar']['playlistSidebarRenderer']['items'][0]['playlistSidebarPrimaryInfoRenderer']['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails']
|
||||||
|
or [])
|
||||||
|
|
||||||
|
thumbnails = []
|
||||||
|
for t in thumbnails_list:
|
||||||
|
if not isinstance(t, dict):
|
||||||
|
continue
|
||||||
|
thumbnail_url = url_or_none(t.get('url'))
|
||||||
|
if not thumbnail_url:
|
||||||
|
continue
|
||||||
|
thumbnails.append({
|
||||||
|
'url': thumbnail_url,
|
||||||
|
'width': int_or_none(t.get('width')),
|
||||||
|
'height': int_or_none(t.get('height')),
|
||||||
|
})
|
||||||
|
|
||||||
if playlist_id is None:
|
if playlist_id is None:
|
||||||
playlist_id = item_id
|
playlist_id = item_id
|
||||||
if title is None:
|
if title is None:
|
||||||
title = "Youtube " + playlist_id.title()
|
title = playlist_id
|
||||||
playlist = self.playlist_result(
|
title += format_field(selected_tab, 'title', ' - %s')
|
||||||
|
|
||||||
|
metadata = {
|
||||||
|
'playlist_id': playlist_id,
|
||||||
|
'playlist_title': title,
|
||||||
|
'playlist_description': description,
|
||||||
|
'uploader': channel_name,
|
||||||
|
'uploader_id': channel_id,
|
||||||
|
'uploader_url': channel_url,
|
||||||
|
'thumbnails': thumbnails,
|
||||||
|
'tags': tags,
|
||||||
|
}
|
||||||
|
if not channel_id:
|
||||||
|
metadata.update(self._extract_uploader(data))
|
||||||
|
metadata.update({
|
||||||
|
'channel': metadata['uploader'],
|
||||||
|
'channel_id': metadata['uploader_id'],
|
||||||
|
'channel_url': metadata['uploader_url']})
|
||||||
|
return self.playlist_result(
|
||||||
self._entries(selected_tab, identity_token),
|
self._entries(selected_tab, identity_token),
|
||||||
playlist_id=playlist_id, playlist_title=title,
|
**metadata)
|
||||||
playlist_description=description)
|
|
||||||
playlist.update(self._extract_uploader(data))
|
|
||||||
return playlist
|
|
||||||
|
|
||||||
def _extract_from_playlist(self, item_id, url, data, playlist):
|
def _extract_from_playlist(self, item_id, url, data, playlist):
|
||||||
title = playlist.get('title') or try_get(
|
title = playlist.get('title') or try_get(
|
||||||
|
|
Loading…
Reference in a new issue