From 62b8dac4908bdb340e173bb70048f0f22e825007 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 31 Oct 2022 17:35:20 +0530 Subject: [PATCH] [extractor] Improve `_generic_title` --- yt_dlp/extractor/arte.py | 4 +--- yt_dlp/extractor/bbc.py | 8 ++------ yt_dlp/extractor/breitbart.py | 3 +-- yt_dlp/extractor/callin.py | 4 +--- yt_dlp/extractor/common.py | 8 +++++--- yt_dlp/extractor/cspan.py | 3 +-- yt_dlp/extractor/fivetv.py | 2 +- yt_dlp/extractor/generic.py | 3 +-- yt_dlp/extractor/genericembeds.py | 2 +- yt_dlp/extractor/glide.py | 2 +- yt_dlp/extractor/meipai.py | 4 +--- yt_dlp/extractor/nhk.py | 3 +-- yt_dlp/extractor/onenewsnz.py | 3 +-- yt_dlp/extractor/steam.py | 2 +- yt_dlp/extractor/tennistv.py | 2 +- yt_dlp/extractor/tv24ua.py | 2 +- 16 files changed, 21 insertions(+), 34 deletions(-) diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py index d3ec4a66c..b60fa0233 100644 --- a/yt_dlp/extractor/arte.py +++ b/yt_dlp/extractor/arte.py @@ -303,9 +303,7 @@ class ArteTVCategoryIE(ArteTVBaseIE): if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )): items.append(video) - title = (self._og_search_title(webpage, default=None) - or self._html_search_regex(r']*>([^<]+)', default=None)) - title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url) + title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title, description=self._og_search_description(webpage, default=None)) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 9a0a4414e..89fce8d5a 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -898,12 +898,8 @@ class BBCIE(BBCCoUkIE): json_ld_info = self._search_json_ld(webpage, playlist_id, default={}) timestamp = json_ld_info.get('timestamp') - playlist_title = json_ld_info.get('title') - if not playlist_title: - playlist_title = (self._og_search_title(webpage, 
default=None) - or self._html_extract_title(webpage, 'playlist title', default=None)) - if playlist_title: - playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() + playlist_title = json_ld_info.get('title') or re.sub( + r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None playlist_description = json_ld_info.get( 'description') or self._og_search_description(webpage, default=None) diff --git a/yt_dlp/extractor/breitbart.py b/yt_dlp/extractor/breitbart.py index a2b04fcce..ca5757374 100644 --- a/yt_dlp/extractor/breitbart.py +++ b/yt_dlp/extractor/breitbart.py @@ -27,8 +27,7 @@ class BreitBartIE(InfoExtractor): self._sort_formats(formats) return { 'id': video_id, - 'title': (self._og_search_title(webpage, default=None) - or self._html_extract_title(webpage, 'video title')), + 'title': self._generic_title('', webpage), 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), 'age_limit': self._rta_search(webpage), diff --git a/yt_dlp/extractor/callin.py b/yt_dlp/extractor/callin.py index fc5da7028..6c8129f06 100644 --- a/yt_dlp/extractor/callin.py +++ b/yt_dlp/extractor/callin.py @@ -51,9 +51,7 @@ class CallinIE(InfoExtractor): episode = next_data['props']['pageProps']['episode'] id = episode['id'] - title = (episode.get('title') - or self._og_search_title(webpage, fatal=False) - or self._html_extract_title(webpage)) + title = episode.get('title') or self._generic_title('', webpage) url = episode['m3u8'] formats = self._extract_m3u8_formats(url, display_id, ext='ts') self._sort_formats(formats) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index fb787a722..84a2b95af 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -3820,9 +3820,11 @@ class InfoExtractor: def _generic_id(url): return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) - @staticmethod - def _generic_title(url): - return 
urllib.parse.unquote(os.path.splitext(url_basename(url))[0]) + def _generic_title(self, url='', webpage='', *, default=None): + return (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, default=None) + or urllib.parse.unquote(os.path.splitext(url_basename(url))[0]) + or default) @staticmethod def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None): diff --git a/yt_dlp/extractor/cspan.py b/yt_dlp/extractor/cspan.py index 84393627a..1184633f5 100644 --- a/yt_dlp/extractor/cspan.py +++ b/yt_dlp/extractor/cspan.py @@ -275,8 +275,7 @@ class CSpanCongressIE(InfoExtractor): self._search_regex(r'jwsetup\s*=\s*({(?:.|\n)[^;]+});', webpage, 'player config'), video_id, transform_source=js_to_json) - title = (self._og_search_title(webpage, default=None) - or self._html_extract_title(webpage, 'video title')) + title = self._generic_title('', webpage) description = (self._og_search_description(webpage, default=None) or self._html_search_meta('description', webpage, 'description', default=None)) diff --git a/yt_dlp/extractor/fivetv.py b/yt_dlp/extractor/fivetv.py index 448c332b3..1f48cfd36 100644 --- a/yt_dlp/extractor/fivetv.py +++ b/yt_dlp/extractor/fivetv.py @@ -71,7 +71,7 @@ class FiveTVIE(InfoExtractor): r']+?href="([^"]+)"[^>]+?class="videoplayer"'], webpage, 'video url') - title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage) + title = self._generic_title('', webpage) duration = int_or_none(self._og_search_property( 'video:duration', webpage, 'duration', default=None)) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 5abde33a9..b0b26b61a 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2740,8 +2740,7 @@ class GenericIE(InfoExtractor): # Site Name | Video Title # Video Title - Tagline | Site Name # and so on and so forth; it's just not practical - 'title': (self._og_search_title(webpage, 
default=None) - or self._html_extract_title(webpage, 'video title', default='video')), + 'title': self._generic_title('', webpage, default='video'), 'description': self._og_search_description(webpage, default=None), 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'age_limit': self._rta_search(webpage), diff --git a/yt_dlp/extractor/genericembeds.py b/yt_dlp/extractor/genericembeds.py index 1bffe275a..45e1618ba 100644 --- a/yt_dlp/extractor/genericembeds.py +++ b/yt_dlp/extractor/genericembeds.py @@ -20,7 +20,7 @@ class HTML5MediaEmbedIE(InfoExtractor): ] def _extract_from_webpage(self, url, webpage): - video_id, title = self._generic_id(url), self._generic_title(url) + video_id, title = self._generic_id(url), self._generic_title(url, webpage) entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') or [] for num, entry in enumerate(entries, start=1): entry.update({ diff --git a/yt_dlp/extractor/glide.py b/yt_dlp/extractor/glide.py index 2bffb26dc..d114f3494 100644 --- a/yt_dlp/extractor/glide.py +++ b/yt_dlp/extractor/glide.py @@ -20,7 +20,7 @@ class GlideIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = self._html_extract_title(webpage, default=None) or self._og_search_title(webpage) + title = self._generic_title('', webpage) video_url = self._proto_relative_url(self._search_regex( r']+src=(["\'])(?P.+?)\1', webpage, 'video URL', default=None, diff --git a/yt_dlp/extractor/meipai.py b/yt_dlp/extractor/meipai.py index 95b6dfe52..1a6f3cd74 100644 --- a/yt_dlp/extractor/meipai.py +++ b/yt_dlp/extractor/meipai.py @@ -48,9 +48,7 @@ class MeipaiIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._og_search_title( - webpage, default=None) or self._html_search_regex( - r']*>([^<]+)', webpage, 'title') + title = self._generic_title('', webpage) formats = [] diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index 60d76d1b1..517660ef1 
100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -321,8 +321,7 @@ class NhkForSchoolProgramListIE(InfoExtractor): webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id) - title = (self._og_search_title(webpage) - or self._html_extract_title(webpage) + title = (self._generic_title('', webpage) or self._html_search_regex(r'

<h3>([^<]+?)とは?\s*</h3>

', webpage, 'title', fatal=False)) title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None description = self._html_search_regex( diff --git a/yt_dlp/extractor/onenewsnz.py b/yt_dlp/extractor/onenewsnz.py index 59d4490d0..a46211e77 100644 --- a/yt_dlp/extractor/onenewsnz.py +++ b/yt_dlp/extractor/onenewsnz.py @@ -106,7 +106,6 @@ class OneNewsNZIE(InfoExtractor): playlist_title = ( traverse_obj(fusion_metadata, ('headlines', 'basic')) - or self._og_search_title(webpage) - or self._html_extract_title(webpage) + or self._generic_title('', webpage) ) return self.playlist_result(entries, display_id, playlist_title) diff --git a/yt_dlp/extractor/steam.py b/yt_dlp/extractor/steam.py index e15c22f2a..eea20ff85 100644 --- a/yt_dlp/extractor/steam.py +++ b/yt_dlp/extractor/steam.py @@ -166,7 +166,7 @@ class SteamCommunityBroadcastIE(InfoExtractor): self._sort_formats(formats) return { 'id': video_id, - 'title': self._html_extract_title(webpage) or self._og_search_title(webpage), + 'title': self._generic_title('', webpage), 'formats': formats, 'live_status': 'is_live', 'view_count': json_data.get('num_view'), diff --git a/yt_dlp/extractor/tennistv.py b/yt_dlp/extractor/tennistv.py index 5baa21d52..47cb0965e 100644 --- a/yt_dlp/extractor/tennistv.py +++ b/yt_dlp/extractor/tennistv.py @@ -142,7 +142,7 @@ class TennisTVIE(InfoExtractor): return { 'id': video_id, - 'title': self._html_extract_title(webpage) or self._og_search_title(webpage), + 'title': self._generic_title('', webpage), 'description': self._html_search_regex( (r'', *self._og_regexes('description')), webpage, 'description', fatal=False), diff --git a/yt_dlp/extractor/tv24ua.py b/yt_dlp/extractor/tv24ua.py index 2f2571df7..8d2475296 100644 --- a/yt_dlp/extractor/tv24ua.py +++ b/yt_dlp/extractor/tv24ua.py @@ -74,6 +74,6 @@ class TV24UAVideoIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, 'thumbnail': thumbnail or self._og_search_thumbnail(webpage), - 'title': 
self._html_extract_title(webpage) or self._og_search_title(webpage), + 'title': self._generic_title('', webpage), 'description': self._og_search_description(webpage, default=None), }