[ertgr] Add new extractors (#2338)

Authored-by: zmousm, dirkf
This commit is contained in:
Zenon Mousmoulas 2022-02-01 09:32:13 +02:00 committed by GitHub
parent 48416bc4a8
commit 2d49720f89
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 338 additions and 0 deletions

316
yt_dlp/extractor/ertgr.py Normal file
View file

@ -0,0 +1,316 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
clean_html,
determine_ext,
ExtractorError,
dict_get,
int_or_none,
merge_dicts,
parse_qs,
parse_age_limit,
parse_iso8601,
str_or_none,
try_get,
unescapeHTML,
url_or_none,
variadic,
)
class ERTFlixBaseIE(InfoExtractor):
def _call_api(
self, video_id, method='Player/AcquireContent', api_version=1,
param_headers=None, data=None, headers=None, **params):
platform_codename = {'platformCodename': 'www'}
headers_as_param = {'X-Api-Date-Format': 'iso', 'X-Api-Camel-Case': False}
headers_as_param.update(param_headers or {})
headers = headers or {}
if data:
headers['Content-Type'] = headers_as_param['Content-Type'] = 'application/json;charset=utf-8'
data = json.dumps(merge_dicts(platform_codename, data)).encode('utf-8')
query = merge_dicts(
{} if data else platform_codename,
{'$headers': json.dumps(headers_as_param)},
params)
response = self._download_json(
'https://api.app.ertflix.gr/v%s/%s' % (str(api_version), method),
video_id, fatal=False, query=query, data=data, headers=headers)
if try_get(response, lambda x: x['Result']['Success']) is True:
return response
def _call_api_get_tiles(self, video_id, *tile_ids):
requested_tile_ids = [video_id] + list(tile_ids)
requested_tiles = [{'Id': tile_id} for tile_id in requested_tile_ids]
tiles_response = self._call_api(
video_id, method='Tile/GetTiles', api_version=2,
data={'RequestedTiles': requested_tiles})
tiles = try_get(tiles_response, lambda x: x['Tiles'], list) or []
if tile_ids:
if sorted([tile['Id'] for tile in tiles]) != sorted(requested_tile_ids):
raise ExtractorError('Requested tiles not found', video_id=video_id)
return tiles
try:
return next(tile for tile in tiles if tile['Id'] == video_id)
except StopIteration:
raise ExtractorError('No matching tile found', video_id=video_id)
class ERTFlixCodenameIE(ERTFlixBaseIE):
IE_NAME = 'ertflix:codename'
IE_DESC = 'ERTFLIX videos by codename'
_VALID_URL = r'ertflix:(?P<id>[\w-]+)'
_TESTS = [{
'url': 'ertflix:monogramma-praxitelis-tzanoylinos',
'md5': '5b9c2cd171f09126167e4082fc1dd0ef',
'info_dict': {
'id': 'monogramma-praxitelis-tzanoylinos',
'ext': 'mp4',
'title': 'md5:ef0b439902963d56c43ac83c3f41dd0e',
},
},
]
def _extract_formats_and_subs(self, video_id, allow_none=True):
media_info = self._call_api(video_id, codename=video_id)
formats, subs = [], {}
for media_file in try_get(media_info, lambda x: x['MediaFiles'], list) or []:
for media in try_get(media_file, lambda x: x['Formats'], list) or []:
fmt_url = url_or_none(try_get(media, lambda x: x['Url']))
if not fmt_url:
continue
ext = determine_ext(fmt_url)
if ext == 'm3u8':
formats_, subs_ = self._extract_m3u8_formats_and_subtitles(
fmt_url, video_id, m3u8_id='hls', ext='mp4', fatal=False)
elif ext == 'mpd':
formats_, subs_ = self._extract_mpd_formats_and_subtitles(
fmt_url, video_id, mpd_id='dash', fatal=False)
else:
formats.append({
'url': fmt_url,
'format_id': str_or_none(media.get('Id')),
})
continue
formats.extend(formats_)
self._merge_subtitles(subs_, target=subs)
if formats or not allow_none:
self._sort_formats(formats)
return formats, subs
def _real_extract(self, url):
video_id = self._match_id(url)
formats, subs = self._extract_formats_and_subs(video_id)
if formats:
return {
'id': video_id,
'formats': formats,
'subtitles': subs,
'title': self._generic_title(url),
}
class ERTFlixIE(ERTFlixBaseIE):
IE_NAME = 'ertflix'
IE_DESC = 'ERTFLIX videos'
_VALID_URL = r'https?://www\.ertflix\.gr/(?:series|vod)/(?P<id>[a-z]{3}\.\d+)'
_TESTS = [{
'url': 'https://www.ertflix.gr/vod/vod.173258-aoratoi-ergates',
'md5': '6479d5e60fd7e520b07ba5411dcdd6e7',
'info_dict': {
'id': 'aoratoi-ergates',
'ext': 'mp4',
'title': 'md5:c1433d598fbba0211b0069021517f8b4',
'description': 'md5:01a64d113c31957eb7eb07719ab18ff4',
'thumbnail': r're:https?://.+\.jpg',
'episode_id': 'vod.173258',
'timestamp': 1639648800,
'upload_date': '20211216',
'duration': 3166,
'age_limit': 8,
},
}, {
'url': 'https://www.ertflix.gr/series/ser.3448-monogramma',
'info_dict': {
'id': 'ser.3448',
'age_limit': 8,
'description': 'Η εκπομπή σαράντα ετών που σημάδεψε τον πολιτισμό μας.',
'title': 'Μονόγραμμα',
},
'playlist_mincount': 64,
}, {
'url': 'https://www.ertflix.gr/series/ser.3448-monogramma?season=1',
'info_dict': {
'id': 'ser.3448',
'age_limit': 8,
'description': 'Η εκπομπή σαράντα ετών που σημάδεψε τον πολιτισμό μας.',
'title': 'Μονόγραμμα',
},
'playlist_count': 22,
}, {
'url': 'https://www.ertflix.gr/series/ser.3448-monogramma?season=1&season=2021%20-%202022',
'info_dict': {
'id': 'ser.3448',
'age_limit': 8,
'description': 'Η εκπομπή σαράντα ετών που σημάδεψε τον πολιτισμό μας.',
'title': 'Μονόγραμμα',
},
'playlist_mincount': 36,
}, {
'url': 'https://www.ertflix.gr/series/ser.164991-to-diktuo-1?season=1-9',
'info_dict': {
'id': 'ser.164991',
'age_limit': 8,
'description': 'Η πρώτη ελληνική εκπομπή με θεματολογία αποκλειστικά γύρω από το ίντερνετ.',
'title': 'Το δίκτυο',
},
'playlist_mincount': 9,
}]
def _extract_episode(self, episode):
codename = try_get(episode, lambda x: x['Codename'], compat_str)
title = episode.get('Title')
description = clean_html(dict_get(episode, ('ShortDescription', 'TinyDescription', )))
if not codename or not title or not episode.get('HasPlayableStream', True):
return
thumbnail = next((
url_or_none(thumb.get('Url'))
for thumb in variadic(dict_get(episode, ('Images', 'Image')) or {})
if thumb.get('IsMain')),
None)
return {
'_type': 'url_transparent',
'thumbnail': thumbnail,
'id': codename,
'episode_id': episode.get('Id'),
'title': title,
'alt_title': episode.get('Subtitle'),
'description': description,
'timestamp': parse_iso8601(episode.get('PublishDate')),
'duration': episode.get('DurationSeconds'),
'age_limit': self._parse_age_rating(episode),
'url': 'ertflix:%s' % (codename, ),
}
@staticmethod
def _parse_age_rating(info_dict):
return parse_age_limit(
info_dict.get('AgeRating')
or (info_dict.get('IsAdultContent') and 18)
or (info_dict.get('IsKidsContent') and 0))
def _extract_series(self, video_id, season_titles=None, season_numbers=None):
media_info = self._call_api(video_id, method='Tile/GetSeriesDetails', id=video_id)
series = try_get(media_info, lambda x: x['Series'], dict) or {}
series_info = {
'age_limit': self._parse_age_rating(series),
'title': series.get('Title'),
'description': dict_get(series, ('ShortDescription', 'TinyDescription', )),
}
if season_numbers:
season_titles = season_titles or []
for season in try_get(series, lambda x: x['Seasons'], list) or []:
if season.get('SeasonNumber') in season_numbers and season.get('Title'):
season_titles.append(season['Title'])
def gen_episode(m_info, season_titles):
for episode_group in try_get(m_info, lambda x: x['EpisodeGroups'], list) or []:
if season_titles and episode_group.get('Title') not in season_titles:
continue
episodes = try_get(episode_group, lambda x: x['Episodes'], list)
if not episodes:
continue
season_info = {
'season': episode_group.get('Title'),
'season_number': int_or_none(episode_group.get('SeasonNumber')),
}
try:
episodes = [(int(ep['EpisodeNumber']), ep) for ep in episodes]
episodes.sort()
except (KeyError, ValueError):
episodes = enumerate(episodes, 1)
for n, episode in episodes:
info = self._extract_episode(episode)
if info is None:
continue
info['episode_number'] = n
info.update(season_info)
yield info
return self.playlist_result(
gen_episode(media_info, season_titles), playlist_id=video_id, **series_info)
def _real_extract(self, url):
video_id = self._match_id(url)
if video_id.startswith('ser.'):
param_season = parse_qs(url).get('season', [None])
param_season = [
(have_number, int_or_none(v) if have_number else str_or_none(v))
for have_number, v in
[(int_or_none(ps) is not None, ps) for ps in param_season]
if v is not None
]
season_kwargs = {
k: [v for is_num, v in param_season if is_num is c] or None
for k, c in
[('season_titles', False), ('season_numbers', True)]
}
return self._extract_series(video_id, **season_kwargs)
return self._extract_episode(self._call_api_get_tiles(video_id))
class ERTWebtvEmbedIE(InfoExtractor):
IE_NAME = 'ertwebtv:embed'
IE_DESC = 'ert.gr webtv embedded videos'
_BASE_PLAYER_URL_RE = re.escape('//www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php')
_VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?f=(?P<id>[^#&]+)'
_TESTS = [{
'url': 'https://www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php?f=trailers/E2251_TO_DIKTYO_E09_16-01_1900.mp4&bgimg=/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg',
'md5': 'f9e9900c25c26f4ecfbddbb4b6305854',
'info_dict': {
'id': 'trailers/E2251_TO_DIKTYO_E09_16-01_1900.mp4',
'title': 'md5:914f06a73cd8b62fbcd6fb90c636e497',
'ext': 'mp4',
'thumbnail': 'https://program.ert.gr/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg'
},
}]
@classmethod
def _extract_urls(cls, webpage):
EMBED_URL_RE = rf'(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{EMBED_URL_RE})(?P=_q1)'
for mobj in re.finditer(EMBED_RE, webpage):
url = unescapeHTML(mobj.group('url'))
if not cls.suitable(url):
continue
yield url
def _real_extract(self, url):
video_id = self._match_id(url)
formats, subs = self._extract_m3u8_formats_and_subtitles(
f'https://mediastream.ert.gr/vodedge/_definst_/mp4:dvrorigin/{video_id}/playlist.m3u8',
video_id, 'mp4')
self._sort_formats(formats)
thumbnail_id = parse_qs(url).get('bgimg', [None])[0]
if thumbnail_id and not thumbnail_id.startswith('http'):
thumbnail_id = f'https://program.ert.gr{thumbnail_id}'
return {
'id': video_id,
'title': f'VOD - {video_id}',
'thumbnail': thumbnail_id,
'formats': formats,
'subtitles': subs,
}

View file

@ -433,6 +433,11 @@ from .eroprofile import (
EroProfileIE, EroProfileIE,
EroProfileAlbumIE, EroProfileAlbumIE,
) )
from .ertgr import (
ERTFlixCodenameIE,
ERTFlixIE,
ERTWebtvEmbedIE,
)
from .escapist import EscapistIE from .escapist import EscapistIE
from .espn import ( from .espn import (
ESPNIE, ESPNIE,

View file

@ -140,6 +140,7 @@ from .medialaan import MedialaanIE
from .simplecast import SimplecastIE from .simplecast import SimplecastIE
from .wimtv import WimTVIE from .wimtv import WimTVIE
from .tvopengr import TVOpenGrEmbedIE from .tvopengr import TVOpenGrEmbedIE
from .ertgr import ERTWebtvEmbedIE
from .tvp import TVPEmbedIE from .tvp import TVPEmbedIE
from .blogger import BloggerIE from .blogger import BloggerIE
from .mainstreaming import MainStreamingIE from .mainstreaming import MainStreamingIE
@ -1923,6 +1924,15 @@ class GenericIE(InfoExtractor):
}, },
}] }]
}, },
{
'url': 'https://www.ertnews.gr/video/manolis-goyalles-o-anthropos-piso-apo-ti-diadiktyaki-vasilopita/',
'info_dict': {
'id': '2022/tv/news-themata-ianouarios/20220114-apotis6-gouales-pita.mp4',
'ext': 'mp4',
'title': 'md5:df64f5b61c06d0e9556c0cdd5cf14464',
'thumbnail': 'https://www.ert.gr/themata/photos/2021/20220114-apotis6-gouales-pita.jpg',
},
},
{ {
# ThePlatform embedded with whitespaces in URLs # ThePlatform embedded with whitespaces in URLs
'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm', 'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm',
@ -3693,6 +3703,13 @@ class GenericIE(InfoExtractor):
if tvopengr_urls: if tvopengr_urls:
return self.playlist_from_matches(tvopengr_urls, video_id, video_title, ie=TVOpenGrEmbedIE.ie_key()) return self.playlist_from_matches(tvopengr_urls, video_id, video_title, ie=TVOpenGrEmbedIE.ie_key())
# Look for ert.gr webtv embeds
ertwebtv_urls = list(ERTWebtvEmbedIE._extract_urls(webpage))
if len(ertwebtv_urls) == 1:
return self.url_result(self._proto_relative_url(ertwebtv_urls[0]), video_title=video_title, url_transparent=True)
elif ertwebtv_urls:
return self.playlist_from_matches(ertwebtv_urls, video_id, video_title, ie=ERTWebtvEmbedIE.ie_key())
tvp_urls = TVPEmbedIE._extract_urls(webpage) tvp_urls = TVPEmbedIE._extract_urls(webpage)
if tvp_urls: if tvp_urls:
return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key()) return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key())