[ie/ARD] Overhaul extractors (#8878)
Closes #8731, Closes #6784, Closes #2366, Closes #2975, Closes #8760 Authored by: seproDev
This commit is contained in:
parent
225cf2b830
commit
5f009a094f
2 changed files with 250 additions and 375 deletions
|
@ -134,8 +134,8 @@ from .arcpublishing import ArcPublishingIE
|
||||||
from .arkena import ArkenaIE
|
from .arkena import ArkenaIE
|
||||||
from .ard import (
|
from .ard import (
|
||||||
ARDBetaMediathekIE,
|
ARDBetaMediathekIE,
|
||||||
|
ARDMediathekCollectionIE,
|
||||||
ARDIE,
|
ARDIE,
|
||||||
ARDMediathekIE,
|
|
||||||
)
|
)
|
||||||
from .arte import (
|
from .arte import (
|
||||||
ArteTVIE,
|
ArteTVIE,
|
||||||
|
|
|
@ -1,24 +1,23 @@
|
||||||
import json
|
|
||||||
import re
|
import re
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
from .generic import GenericIE
|
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
|
OnDemandPagedList,
|
||||||
determine_ext,
|
determine_ext,
|
||||||
ExtractorError,
|
|
||||||
int_or_none,
|
int_or_none,
|
||||||
|
join_nonempty,
|
||||||
|
make_archive_id,
|
||||||
parse_duration,
|
parse_duration,
|
||||||
qualities,
|
parse_iso8601,
|
||||||
|
remove_start,
|
||||||
str_or_none,
|
str_or_none,
|
||||||
try_get,
|
|
||||||
unified_strdate,
|
unified_strdate,
|
||||||
unified_timestamp,
|
|
||||||
update_url,
|
|
||||||
update_url_query,
|
update_url_query,
|
||||||
url_or_none,
|
url_or_none,
|
||||||
xpath_text,
|
xpath_text,
|
||||||
)
|
)
|
||||||
from ..compat import compat_etree_fromstring
|
from ..utils.traversal import traverse_obj
|
||||||
|
|
||||||
|
|
||||||
class ARDMediathekBaseIE(InfoExtractor):
|
class ARDMediathekBaseIE(InfoExtractor):
|
||||||
|
@ -61,45 +60,6 @@ class ARDMediathekBaseIE(InfoExtractor):
|
||||||
'subtitles': subtitles,
|
'subtitles': subtitles,
|
||||||
}
|
}
|
||||||
|
|
||||||
def _ARD_extract_episode_info(self, title):
|
|
||||||
"""Try to extract season/episode data from the title."""
|
|
||||||
res = {}
|
|
||||||
if not title:
|
|
||||||
return res
|
|
||||||
|
|
||||||
for pattern in [
|
|
||||||
# Pattern for title like "Homo sapiens (S06/E07) - Originalversion"
|
|
||||||
# from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw
|
|
||||||
r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
|
|
||||||
# E.g.: title="Fritjof aus Norwegen (2) (AD)"
|
|
||||||
# from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/
|
|
||||||
r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
|
|
||||||
r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
|
|
||||||
# E.g.: title="Folge 25/42: Symmetrie"
|
|
||||||
# from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/
|
|
||||||
# E.g.: title="Folge 1063 - Vertrauen"
|
|
||||||
# from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/
|
|
||||||
r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
|
|
||||||
]:
|
|
||||||
m = re.match(pattern, title)
|
|
||||||
if m:
|
|
||||||
groupdict = m.groupdict()
|
|
||||||
res['season_number'] = int_or_none(groupdict.get('season_number'))
|
|
||||||
res['episode_number'] = int_or_none(groupdict.get('episode_number'))
|
|
||||||
res['episode'] = str_or_none(groupdict.get('episode'))
|
|
||||||
# Build the episode title by removing numeric episode information:
|
|
||||||
if groupdict.get('ep_info') and not res['episode']:
|
|
||||||
res['episode'] = str_or_none(
|
|
||||||
title.replace(groupdict.get('ep_info'), ''))
|
|
||||||
if res['episode']:
|
|
||||||
res['episode'] = res['episode'].strip()
|
|
||||||
break
|
|
||||||
|
|
||||||
# As a fallback use the whole title as the episode name:
|
|
||||||
if not res.get('episode'):
|
|
||||||
res['episode'] = title.strip()
|
|
||||||
return res
|
|
||||||
|
|
||||||
def _extract_formats(self, media_info, video_id):
|
def _extract_formats(self, media_info, video_id):
|
||||||
type_ = media_info.get('_type')
|
type_ = media_info.get('_type')
|
||||||
media_array = media_info.get('_mediaArray', [])
|
media_array = media_info.get('_mediaArray', [])
|
||||||
|
@ -155,138 +115,6 @@ class ARDMediathekBaseIE(InfoExtractor):
|
||||||
return formats
|
return formats
|
||||||
|
|
||||||
|
|
||||||
class ARDMediathekIE(ARDMediathekBaseIE):
|
|
||||||
IE_NAME = 'ARD:mediathek'
|
|
||||||
_VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
|
|
||||||
|
|
||||||
_TESTS = [{
|
|
||||||
# available till 26.07.2022
|
|
||||||
'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822',
|
|
||||||
'info_dict': {
|
|
||||||
'id': '44726822',
|
|
||||||
'ext': 'mp4',
|
|
||||||
'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?',
|
|
||||||
'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5',
|
|
||||||
'duration': 1740,
|
|
||||||
},
|
|
||||||
'params': {
|
|
||||||
# m3u8 download
|
|
||||||
'skip_download': True,
|
|
||||||
}
|
|
||||||
}, {
|
|
||||||
'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872',
|
|
||||||
'only_matching': True,
|
|
||||||
}, {
|
|
||||||
# audio
|
|
||||||
'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
|
|
||||||
'only_matching': True,
|
|
||||||
}, {
|
|
||||||
'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
|
|
||||||
'only_matching': True,
|
|
||||||
}, {
|
|
||||||
# audio
|
|
||||||
'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158',
|
|
||||||
'only_matching': True,
|
|
||||||
}, {
|
|
||||||
'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698',
|
|
||||||
'only_matching': True,
|
|
||||||
}]
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def suitable(cls, url):
|
|
||||||
return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url)
|
|
||||||
|
|
||||||
def _real_extract(self, url):
|
|
||||||
# determine video id from url
|
|
||||||
m = self._match_valid_url(url)
|
|
||||||
|
|
||||||
document_id = None
|
|
||||||
|
|
||||||
numid = re.search(r'documentId=([0-9]+)', url)
|
|
||||||
if numid:
|
|
||||||
document_id = video_id = numid.group(1)
|
|
||||||
else:
|
|
||||||
video_id = m.group('video_id')
|
|
||||||
|
|
||||||
webpage = self._download_webpage(url, video_id)
|
|
||||||
|
|
||||||
ERRORS = (
|
|
||||||
('>Leider liegt eine Störung vor.', 'Video %s is unavailable'),
|
|
||||||
('>Der gewünschte Beitrag ist nicht mehr verfügbar.<',
|
|
||||||
'Video %s is no longer available'),
|
|
||||||
)
|
|
||||||
|
|
||||||
for pattern, message in ERRORS:
|
|
||||||
if pattern in webpage:
|
|
||||||
raise ExtractorError(message % video_id, expected=True)
|
|
||||||
|
|
||||||
if re.search(r'[\?&]rss($|[=&])', url):
|
|
||||||
doc = compat_etree_fromstring(webpage.encode('utf-8'))
|
|
||||||
if doc.tag == 'rss':
|
|
||||||
return GenericIE()._extract_rss(url, video_id, doc)
|
|
||||||
|
|
||||||
title = self._og_search_title(webpage, default=None) or self._html_search_regex(
|
|
||||||
[r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
|
|
||||||
r'<meta name="dcterms\.title" content="(.*?)"/>',
|
|
||||||
r'<h4 class="headline">(.*?)</h4>',
|
|
||||||
r'<title[^>]*>(.*?)</title>'],
|
|
||||||
webpage, 'title')
|
|
||||||
description = self._og_search_description(webpage, default=None) or self._html_search_meta(
|
|
||||||
'dcterms.abstract', webpage, 'description', default=None)
|
|
||||||
if description is None:
|
|
||||||
description = self._html_search_meta(
|
|
||||||
'description', webpage, 'meta description', default=None)
|
|
||||||
if description is None:
|
|
||||||
description = self._html_search_regex(
|
|
||||||
r'<p\s+class="teasertext">(.+?)</p>',
|
|
||||||
webpage, 'teaser text', default=None)
|
|
||||||
|
|
||||||
# Thumbnail is sometimes not present.
|
|
||||||
# It is in the mobile version, but that seems to use a different URL
|
|
||||||
# structure altogether.
|
|
||||||
thumbnail = self._og_search_thumbnail(webpage, default=None)
|
|
||||||
|
|
||||||
media_streams = re.findall(r'''(?x)
|
|
||||||
mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
|
|
||||||
"([^"]+)"''', webpage)
|
|
||||||
|
|
||||||
if media_streams:
|
|
||||||
QUALITIES = qualities(['lo', 'hi', 'hq'])
|
|
||||||
formats = []
|
|
||||||
for furl in set(media_streams):
|
|
||||||
if furl.endswith('.f4m'):
|
|
||||||
fid = 'f4m'
|
|
||||||
else:
|
|
||||||
fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
|
|
||||||
fid = fid_m.group(1) if fid_m else None
|
|
||||||
formats.append({
|
|
||||||
'quality': QUALITIES(fid),
|
|
||||||
'format_id': fid,
|
|
||||||
'url': furl,
|
|
||||||
})
|
|
||||||
info = {
|
|
||||||
'formats': formats,
|
|
||||||
}
|
|
||||||
else: # request JSON file
|
|
||||||
if not document_id:
|
|
||||||
video_id = self._search_regex(
|
|
||||||
(r'/play/(?:config|media|sola)/(\d+)', r'contentId["\']\s*:\s*(\d+)'),
|
|
||||||
webpage, 'media id', default=None)
|
|
||||||
info = self._extract_media_info(
|
|
||||||
'http://www.ardmediathek.de/play/media/%s' % video_id,
|
|
||||||
webpage, video_id)
|
|
||||||
|
|
||||||
info.update({
|
|
||||||
'id': video_id,
|
|
||||||
'title': title,
|
|
||||||
'description': description,
|
|
||||||
'thumbnail': thumbnail,
|
|
||||||
})
|
|
||||||
info.update(self._ARD_extract_episode_info(info['title']))
|
|
||||||
|
|
||||||
return info
|
|
||||||
|
|
||||||
|
|
||||||
class ARDIE(InfoExtractor):
|
class ARDIE(InfoExtractor):
|
||||||
_VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
|
_VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
|
||||||
_TESTS = [{
|
_TESTS = [{
|
||||||
|
@ -399,21 +227,23 @@ class ARDIE(InfoExtractor):
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class ARDBetaMediathekIE(ARDMediathekBaseIE):
|
class ARDBetaMediathekIE(InfoExtractor):
|
||||||
|
IE_NAME = 'ARDMediathek'
|
||||||
_VALID_URL = r'''(?x)https://
|
_VALID_URL = r'''(?x)https://
|
||||||
(?:(?:beta|www)\.)?ardmediathek\.de/
|
(?:(?:beta|www)\.)?ardmediathek\.de/
|
||||||
(?:(?P<client>[^/]+)/)?
|
(?:[^/]+/)?
|
||||||
(?:player|live|video|(?P<playlist>sendung|serie|sammlung))/
|
(?:player|live|video)/
|
||||||
(?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
|
(?:(?P<display_id>[^?#]+)/)?
|
||||||
(?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
|
(?P<id>[a-zA-Z0-9]+)
|
||||||
(?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
|
/?(?:[?#]|$)'''
|
||||||
|
_GEO_COUNTRIES = ['DE']
|
||||||
|
|
||||||
_TESTS = [{
|
_TESTS = [{
|
||||||
'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
|
'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
|
||||||
'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4',
|
'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'display_id': 'filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen',
|
'display_id': 'filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen',
|
||||||
'id': '12939099',
|
'id': 'Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
|
||||||
'title': 'Liebe auf vier Pfoten',
|
'title': 'Liebe auf vier Pfoten',
|
||||||
'description': r're:^Claudia Schmitt, Anwältin in Salzburg',
|
'description': r're:^Claudia Schmitt, Anwältin in Salzburg',
|
||||||
'duration': 5222,
|
'duration': 5222,
|
||||||
|
@ -422,7 +252,10 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
|
||||||
'upload_date': '20231130',
|
'upload_date': '20231130',
|
||||||
'ext': 'mp4',
|
'ext': 'mp4',
|
||||||
'episode': 'Liebe auf vier Pfoten',
|
'episode': 'Liebe auf vier Pfoten',
|
||||||
'series': 'Filme im MDR'
|
'series': 'Filme im MDR',
|
||||||
|
'age_limit': 0,
|
||||||
|
'channel': 'MDR',
|
||||||
|
'_old_archive_ids': ['ardbetamediathek 12939099'],
|
||||||
},
|
},
|
||||||
}, {
|
}, {
|
||||||
'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
|
'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
|
||||||
|
@ -443,7 +276,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
|
||||||
'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
|
'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
|
||||||
'md5': '1e73ded21cb79bac065117e80c81dc88',
|
'md5': '1e73ded21cb79bac065117e80c81dc88',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': '10049223',
|
'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
|
||||||
'ext': 'mp4',
|
'ext': 'mp4',
|
||||||
'title': 'tagesschau, 20:00 Uhr',
|
'title': 'tagesschau, 20:00 Uhr',
|
||||||
'timestamp': 1636398000,
|
'timestamp': 1636398000,
|
||||||
|
@ -454,6 +287,26 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
|
||||||
'episode': 'tagesschau, 20:00 Uhr',
|
'episode': 'tagesschau, 20:00 Uhr',
|
||||||
'series': 'tagesschau',
|
'series': 'tagesschau',
|
||||||
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678',
|
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678',
|
||||||
|
'channel': 'ARD-Aktuell',
|
||||||
|
'_old_archive_ids': ['ardbetamediathek 10049223'],
|
||||||
|
},
|
||||||
|
}, {
|
||||||
|
'url': 'https://www.ardmediathek.de/video/7-tage/7-tage-unter-harten-jungs/hr-fernsehen/N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3',
|
||||||
|
'md5': 'c428b9effff18ff624d4f903bda26315',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'duration': 2700,
|
||||||
|
'episode': '7 Tage ... unter harten Jungs',
|
||||||
|
'description': 'md5:0f215470dcd2b02f59f4bd10c963f072',
|
||||||
|
'upload_date': '20231005',
|
||||||
|
'timestamp': 1696491171,
|
||||||
|
'display_id': '7-tage/7-tage-unter-harten-jungs/hr-fernsehen',
|
||||||
|
'series': '7 Tage ...',
|
||||||
|
'channel': 'HR',
|
||||||
|
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:f6e6d5ffac41925c?w=960&ch=fa32ba69bc87989a',
|
||||||
|
'title': '7 Tage ... unter harten Jungs',
|
||||||
|
'_old_archive_ids': ['ardbetamediathek 94834686'],
|
||||||
},
|
},
|
||||||
}, {
|
}, {
|
||||||
'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
|
'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
|
||||||
|
@ -470,6 +323,176 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
|
||||||
}, {
|
}, {
|
||||||
'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
|
'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
|
||||||
'only_matching': True,
|
'only_matching': True,
|
||||||
|
}, {
|
||||||
|
'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
|
||||||
|
'only_matching': True,
|
||||||
|
}]
|
||||||
|
|
||||||
|
def _extract_episode_info(self, title):
|
||||||
|
patterns = [
|
||||||
|
# Pattern for title like "Homo sapiens (S06/E07) - Originalversion"
|
||||||
|
# from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw
|
||||||
|
r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
|
||||||
|
# E.g.: title="Fritjof aus Norwegen (2) (AD)"
|
||||||
|
# from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/
|
||||||
|
r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
|
||||||
|
r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
|
||||||
|
# E.g.: title="Folge 25/42: Symmetrie"
|
||||||
|
# from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/
|
||||||
|
# E.g.: title="Folge 1063 - Vertrauen"
|
||||||
|
# from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/
|
||||||
|
r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
|
||||||
|
# As a fallback use the full title
|
||||||
|
r'(?P<title>.*)',
|
||||||
|
]
|
||||||
|
|
||||||
|
return traverse_obj(patterns, (..., {partial(re.match, string=title)}, {
|
||||||
|
'season_number': ('season_number', {int_or_none}),
|
||||||
|
'episode_number': ('episode_number', {int_or_none}),
|
||||||
|
'episode': ((
|
||||||
|
('episode', {str_or_none}),
|
||||||
|
('ep_info', {lambda x: title.replace(x, '')}),
|
||||||
|
('title', {str}),
|
||||||
|
), {str.strip}),
|
||||||
|
}), get_all=False)
|
||||||
|
|
||||||
|
def _real_extract(self, url):
    """Extract a single on-demand video or livestream.

    Metadata and stream URLs come from one JSON request to the
    page-gateway item endpoint; formats and subtitles are taken from the
    player widget's embedded media collection.
    """
    video_id, display_id = self._match_valid_url(url).group('id', 'display_id')

    page_data = self._download_json(
        f'https://api.ardmediathek.de/page-gateway/pages/ard/item/{video_id}', video_id, query={
            'embedded': 'false',
            'mcV6': 'true',
        })

    # A page without any player widget yields None here; fall back to an
    # empty dict so the lookups below do not raise AttributeError and the
    # extraction ends with an empty format list instead
    player_data = traverse_obj(
        page_data, ('widgets', lambda _, v: v['type'] in ('player_ondemand', 'player_live'), {dict}),
        get_all=False) or {}
    is_live = player_data.get('type') == 'player_live'
    media_data = traverse_obj(player_data, ('mediaCollection', 'embedded', {dict}))

    if player_data.get('blockedByFsk'):
        self.raise_no_formats('This video is only available after 22:00', expected=True)

    formats = []
    subtitles = {}
    for stream in traverse_obj(media_data, ('streams', ..., {dict})):
        kind = stream.get('kind')
        # Prioritize main stream over sign language and others
        preference = 1 if kind == 'main' else None
        for media in traverse_obj(stream, ('media', lambda _, v: url_or_none(v['url']))):
            media_url = media['url']

            # Only the first audio track is inspected; the 'standard' kind
            # is blanked so that plain German audio ends up as just 'deu'
            audio_kind = traverse_obj(media, (
                'audios', 0, 'kind', {str}), default='').replace('standard', '')
            lang_code = traverse_obj(media, ('audios', 0, 'languageCode', {str})) or 'deu'
            lang = join_nonempty(lang_code, audio_kind)
            language_preference = 10 if lang == 'deu' else -10

            if determine_ext(media_url) == 'm3u8':
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    media_url, video_id, m3u8_id=f'hls-{kind}', preference=preference, fatal=False, live=is_live)
                for f in fmts:
                    f['language'] = lang
                    f['language_preference'] = language_preference
                formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)
            else:
                formats.append({
                    'url': media_url,
                    'format_id': f'http-{kind}',
                    'preference': preference,
                    'language': lang,
                    'language_preference': language_preference,
                    **traverse_obj(media, {
                        'format_note': ('forcedLabel', {str}),
                        'width': ('maxHResolutionPx', {int_or_none}),
                        'height': ('maxVResolutionPx', {int_or_none}),
                        'vcodec': ('videoCodec', {str}),
                    }),
                })

    for sub in traverse_obj(media_data, ('subtitles', ..., {dict})):
        for sources in traverse_obj(sub, ('sources', lambda _, v: url_or_none(v['url']))):
            subtitles.setdefault(sub.get('languageCode') or 'deu', []).append({
                'url': sources['url'],
                'ext': {'webvtt': 'vtt', 'ebutt': 'ttml'}.get(sources.get('kind')),
            })

    age_limit = traverse_obj(page_data, ('fskRating', {lambda x: remove_start(x, 'FSK')}, {int_or_none}))
    old_id = traverse_obj(page_data, ('tracking', 'atiCustomVars', 'contentId'))

    info = {
        'id': video_id,
        'display_id': display_id,
        'formats': formats,
        'subtitles': subtitles,
        'is_live': is_live,
        'age_limit': age_limit,
        **traverse_obj(media_data, ('meta', {
            'title': 'title',
            'description': 'synopsis',
            'timestamp': ('broadcastedOnDateTime', {parse_iso8601}),
            'series': 'seriesTitle',
            'thumbnail': ('images', 0, 'url', {url_or_none}),
            'duration': ('durationSeconds', {int_or_none}),
            'channel': 'clipSourceName',
        })),
        # Guard the unpacking in case no episode info could be derived
        **(self._extract_episode_info(page_data.get('title')) or {}),
    }
    if old_id:
        # Only record the old numeric id when the API provides one;
        # otherwise a bogus "... None" archive entry would be produced
        info['_old_archive_ids'] = [make_archive_id(ARDBetaMediathekIE, old_id)]
    return info
|
||||||
|
|
||||||
|
|
||||||
|
class ARDMediathekCollectionIE(InfoExtractor):
|
||||||
|
_VALID_URL = r'''(?x)https://
|
||||||
|
(?:(?:beta|www)\.)?ardmediathek\.de/
|
||||||
|
(?:[^/?#]+/)?
|
||||||
|
(?P<playlist>sendung|serie|sammlung)/
|
||||||
|
(?:(?P<display_id>[^?#]+?)/)?
|
||||||
|
(?P<id>[a-zA-Z0-9]+)
|
||||||
|
(?:/(?P<season>\d+)(?:/(?P<version>OV|AD))?)?/?(?:[?#]|$)'''
|
||||||
|
_GEO_COUNTRIES = ['DE']
|
||||||
|
|
||||||
|
_TESTS = [{
|
||||||
|
'url': 'https://www.ardmediathek.de/serie/quiz/staffel-1-originalversion/Y3JpZDovL3dkci5kZS9vbmUvcXVpeg/1/OV',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'Y3JpZDovL3dkci5kZS9vbmUvcXVpeg_1_OV',
|
||||||
|
'display_id': 'quiz/staffel-1-originalversion',
|
||||||
|
'title': 'Staffel 1 Originalversion',
|
||||||
|
},
|
||||||
|
'playlist_count': 3,
|
||||||
|
}, {
|
||||||
|
'url': 'https://www.ardmediathek.de/serie/babylon-berlin/staffel-4-mit-audiodeskription/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu/4/AD',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu_4_AD',
|
||||||
|
'display_id': 'babylon-berlin/staffel-4-mit-audiodeskription',
|
||||||
|
'title': 'Staffel 4 mit Audiodeskription',
|
||||||
|
},
|
||||||
|
'playlist_count': 12,
|
||||||
|
}, {
|
||||||
|
'url': 'https://www.ardmediathek.de/serie/babylon-berlin/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu/1/',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu_1',
|
||||||
|
'display_id': 'babylon-berlin/staffel-1',
|
||||||
|
'title': 'Staffel 1',
|
||||||
|
},
|
||||||
|
'playlist_count': 8,
|
||||||
|
}, {
|
||||||
|
'url': 'https://www.ardmediathek.de/sendung/tatort/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydA',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydA',
|
||||||
|
'display_id': 'tatort',
|
||||||
|
'title': 'Tatort',
|
||||||
|
},
|
||||||
|
'playlist_mincount': 500,
|
||||||
|
}, {
|
||||||
|
'url': 'https://www.ardmediathek.de/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2',
|
||||||
|
'info_dict': {
|
||||||
|
'id': '5eOHzt8XB2sqeFXbIoJlg2',
|
||||||
|
'display_id': 'die-kirche-bleibt-im-dorf',
|
||||||
|
'title': 'Die Kirche bleibt im Dorf',
|
||||||
|
'description': 'Die Kirche bleibt im Dorf',
|
||||||
|
},
|
||||||
|
'playlist_count': 4,
|
||||||
}, {
|
}, {
|
||||||
# playlist of type 'sendung'
|
# playlist of type 'sendung'
|
||||||
'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
|
'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
|
||||||
|
@ -482,196 +505,48 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
|
||||||
# playlist of type 'sammlung'
|
# playlist of type 'sammlung'
|
||||||
'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
|
'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
|
||||||
'only_matching': True,
|
'only_matching': True,
|
||||||
}, {
|
|
||||||
'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
|
|
||||||
'only_matching': True,
|
|
||||||
}, {
|
|
||||||
'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet',
|
|
||||||
'only_matching': True,
|
|
||||||
}]
|
}]
|
||||||
|
|
||||||
def _ARD_load_playlist_snippet(self, playlist_id, display_id, client, mode, page_number):
|
_PAGE_SIZE = 100
|
||||||
""" Query the ARD server for playlist information
|
|
||||||
and returns the data in "raw" format """
|
|
||||||
assert mode in ('sendung', 'serie', 'sammlung')
|
|
||||||
if mode in ('sendung', 'serie'):
|
|
||||||
graphQL = json.dumps({
|
|
||||||
'query': '''{
|
|
||||||
showPage(
|
|
||||||
client: "%s"
|
|
||||||
showId: "%s"
|
|
||||||
pageNumber: %d
|
|
||||||
) {
|
|
||||||
pagination {
|
|
||||||
pageSize
|
|
||||||
totalElements
|
|
||||||
}
|
|
||||||
teasers { # Array
|
|
||||||
mediumTitle
|
|
||||||
links { target { id href title } }
|
|
||||||
type
|
|
||||||
}
|
|
||||||
}}''' % (client, playlist_id, page_number),
|
|
||||||
}).encode()
|
|
||||||
else: # mode == 'sammlung'
|
|
||||||
graphQL = json.dumps({
|
|
||||||
'query': '''{
|
|
||||||
morePage(
|
|
||||||
client: "%s"
|
|
||||||
compilationId: "%s"
|
|
||||||
pageNumber: %d
|
|
||||||
) {
|
|
||||||
widget {
|
|
||||||
pagination {
|
|
||||||
pageSize
|
|
||||||
totalElements
|
|
||||||
}
|
|
||||||
teasers { # Array
|
|
||||||
mediumTitle
|
|
||||||
links { target { id href title } }
|
|
||||||
type
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}}''' % (client, playlist_id, page_number),
|
|
||||||
}).encode()
|
|
||||||
# Ressources for ARD graphQL debugging:
|
|
||||||
# https://api-test.ardmediathek.de/public-gateway
|
|
||||||
show_page = self._download_json(
|
|
||||||
'https://api.ardmediathek.de/public-gateway',
|
|
||||||
'[Playlist] %s' % display_id,
|
|
||||||
data=graphQL,
|
|
||||||
headers={'Content-Type': 'application/json'})['data']
|
|
||||||
# align the structure of the returned data:
|
|
||||||
if mode in ('sendung', 'serie'):
|
|
||||||
show_page = show_page['showPage']
|
|
||||||
else: # mode == 'sammlung'
|
|
||||||
show_page = show_page['morePage']['widget']
|
|
||||||
return show_page
|
|
||||||
|
|
||||||
def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
|
|
||||||
""" Collects all playlist entries and returns them as info dict.
|
|
||||||
Supports playlists of mode 'sendung', 'serie', and 'sammlung',
|
|
||||||
as well as nested playlists. """
|
|
||||||
entries = []
|
|
||||||
pageNumber = 0
|
|
||||||
while True: # iterate by pageNumber
|
|
||||||
show_page = self._ARD_load_playlist_snippet(
|
|
||||||
playlist_id, display_id, client, mode, pageNumber)
|
|
||||||
for teaser in show_page['teasers']: # process playlist items
|
|
||||||
if '/compilation/' in teaser['links']['target']['href']:
|
|
||||||
# alternativ cond.: teaser['type'] == "compilation"
|
|
||||||
# => This is an nested compilation, e.g. like:
|
|
||||||
# https://www.ardmediathek.de/ard/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2/
|
|
||||||
link_mode = 'sammlung'
|
|
||||||
else:
|
|
||||||
link_mode = 'video'
|
|
||||||
|
|
||||||
item_url = 'https://www.ardmediathek.de/%s/%s/%s/%s/%s' % (
|
|
||||||
client, link_mode, display_id,
|
|
||||||
# perform HTLM quoting of episode title similar to ARD:
|
|
||||||
re.sub('^-|-$', '', # remove '-' from begin/end
|
|
||||||
re.sub('[^a-zA-Z0-9]+', '-', # replace special chars by -
|
|
||||||
teaser['links']['target']['title'].lower()
|
|
||||||
.replace('ä', 'ae').replace('ö', 'oe')
|
|
||||||
.replace('ü', 'ue').replace('ß', 'ss'))),
|
|
||||||
teaser['links']['target']['id'])
|
|
||||||
entries.append(self.url_result(
|
|
||||||
item_url,
|
|
||||||
ie=ARDBetaMediathekIE.ie_key()))
|
|
||||||
|
|
||||||
if (show_page['pagination']['pageSize'] * (pageNumber + 1)
|
|
||||||
>= show_page['pagination']['totalElements']):
|
|
||||||
# we've processed enough pages to get all playlist entries
|
|
||||||
break
|
|
||||||
pageNumber = pageNumber + 1
|
|
||||||
|
|
||||||
return self.playlist_result(entries, playlist_id, playlist_title=display_id)
|
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group(
|
playlist_id, display_id, playlist_type, season_number, version = self._match_valid_url(url).group(
|
||||||
'id', 'display_id', 'playlist', 'client', 'season')
|
'id', 'display_id', 'playlist', 'season', 'version')
|
||||||
display_id, client = display_id or video_id, client or 'ard'
|
|
||||||
|
|
||||||
if playlist_type:
|
def call_api(page_num):
|
||||||
# TODO: Extract only specified season
|
api_path = 'compilations/ard' if playlist_type == 'sammlung' else 'widgets/ard/asset'
|
||||||
return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type)
|
return self._download_json(
|
||||||
|
f'https://api.ardmediathek.de/page-gateway/{api_path}/{playlist_id}', playlist_id,
|
||||||
|
f'Downloading playlist page {page_num}', query={
|
||||||
|
'pageNumber': page_num,
|
||||||
|
'pageSize': self._PAGE_SIZE,
|
||||||
|
**({
|
||||||
|
'seasoned': 'true',
|
||||||
|
'seasonNumber': season_number,
|
||||||
|
'withOriginalversion': 'true' if version == 'OV' else 'false',
|
||||||
|
'withAudiodescription': 'true' if version == 'AD' else 'false',
|
||||||
|
} if season_number else {}),
|
||||||
|
})
|
||||||
|
|
||||||
player_page = self._download_json(
|
def fetch_page(page_num):
|
||||||
'https://api.ardmediathek.de/public-gateway',
|
for item in traverse_obj(call_api(page_num), ('teasers', ..., {dict})):
|
||||||
display_id, data=json.dumps({
|
item_id = traverse_obj(item, ('links', 'target', ('urlId', 'id')), 'id', get_all=False)
|
||||||
'query': '''{
|
if not item_id or item_id == playlist_id:
|
||||||
playerPage(client:"%s", clipId: "%s") {
|
continue
|
||||||
blockedByFsk
|
item_mode = 'sammlung' if item.get('type') == 'compilation' else 'video'
|
||||||
broadcastedOn
|
yield self.url_result(
|
||||||
maturityContentRating
|
f'https://www.ardmediathek.de/{item_mode}/{item_id}',
|
||||||
mediaCollection {
|
ie=(ARDMediathekCollectionIE if item_mode == 'sammlung' else ARDBetaMediathekIE),
|
||||||
_duration
|
**traverse_obj(item, {
|
||||||
_geoblocked
|
'id': ('id', {str}),
|
||||||
_isLive
|
'title': ('longTitle', {str}),
|
||||||
_mediaArray {
|
'duration': ('duration', {int_or_none}),
|
||||||
_mediaStreamArray {
|
'timestamp': ('broadcastedOn', {parse_iso8601}),
|
||||||
_quality
|
}))
|
||||||
_server
|
|
||||||
_stream
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_previewImage
|
|
||||||
_subtitleUrl
|
|
||||||
_type
|
|
||||||
}
|
|
||||||
show {
|
|
||||||
title
|
|
||||||
}
|
|
||||||
image {
|
|
||||||
src
|
|
||||||
}
|
|
||||||
synopsis
|
|
||||||
title
|
|
||||||
tracking {
|
|
||||||
atiCustomVars {
|
|
||||||
contentId
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}''' % (client, video_id),
|
|
||||||
}).encode(), headers={
|
|
||||||
'Content-Type': 'application/json'
|
|
||||||
})['data']['playerPage']
|
|
||||||
title = player_page['title']
|
|
||||||
content_id = str_or_none(try_get(
|
|
||||||
player_page, lambda x: x['tracking']['atiCustomVars']['contentId']))
|
|
||||||
media_collection = player_page.get('mediaCollection') or {}
|
|
||||||
if not media_collection and content_id:
|
|
||||||
media_collection = self._download_json(
|
|
||||||
'https://www.ardmediathek.de/play/media/' + content_id,
|
|
||||||
content_id, fatal=False) or {}
|
|
||||||
info = self._parse_media_info(
|
|
||||||
media_collection, content_id or video_id,
|
|
||||||
player_page.get('blockedByFsk'))
|
|
||||||
age_limit = None
|
|
||||||
description = player_page.get('synopsis')
|
|
||||||
maturity_content_rating = player_page.get('maturityContentRating')
|
|
||||||
if maturity_content_rating:
|
|
||||||
age_limit = int_or_none(maturity_content_rating.lstrip('FSK'))
|
|
||||||
if not age_limit and description:
|
|
||||||
age_limit = int_or_none(self._search_regex(
|
|
||||||
r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None))
|
|
||||||
info.update({
|
|
||||||
'age_limit': age_limit,
|
|
||||||
'display_id': display_id,
|
|
||||||
'title': title,
|
|
||||||
'description': description,
|
|
||||||
'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
|
|
||||||
'series': try_get(player_page, lambda x: x['show']['title']),
|
|
||||||
'thumbnail': (media_collection.get('_previewImage')
|
|
||||||
or try_get(player_page, lambda x: update_url(x['image']['src'], query=None, fragment=None))
|
|
||||||
or self.get_thumbnail_from_html(display_id, url)),
|
|
||||||
})
|
|
||||||
info.update(self._ARD_extract_episode_info(info['title']))
|
|
||||||
return info
|
|
||||||
|
|
||||||
def get_thumbnail_from_html(self, display_id, url):
|
page_data = call_api(0)
|
||||||
webpage = self._download_webpage(url, display_id, fatal=False) or ''
|
full_id = join_nonempty(playlist_id, season_number, version, delim='_')
|
||||||
return (
|
|
||||||
self._og_search_thumbnail(webpage, default=None)
|
return self.playlist_result(
|
||||||
or self._html_search_meta('thumbnailUrl', webpage, default=None))
|
OnDemandPagedList(fetch_page, self._PAGE_SIZE), full_id, display_id=display_id,
|
||||||
|
title=page_data.get('title'), description=page_data.get('synopsis'))
|
||||||
|
|
Loading…
Reference in a new issue