[MainStreaming] Add extractor (#2180)

Closes #1183, https://github.com/ytdl-org/youtube-dl/issues/29615 Authored by: coletdjnz
2022-01-05 08:48:17 +00:00 · 2022-01-05 08:48:17 +00:00 · 9c634ef857
commit 9c634ef857
parent 9f517bb1f3
3 changed files with 240 additions and 1 deletions
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@ -756,6 +756,7 @@ from .mailru import (
    MailRuMusicIE,
    MailRuMusicSearchIE,
 )
+from .mainstreaming import MainStreamingIE
 from .malltv import MallTVIE
 from .mangomolo import (
    MangomoloVideoIE,
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@ -137,6 +137,7 @@ from .simplecast import SimplecastIE
 from .wimtv import WimTVIE
 from .tvp import TVPEmbedIE
 from .blogger import BloggerIE
+from .mainstreaming import MainStreamingIE
 from .gfycat import GfycatIE


@ -2384,6 +2385,19 @@ class GenericIE(InfoExtractor):
                'upload_date': '20211113'
            }
        },
+        {
+            # MainStreaming player
+            'url': 'https://www.lactv.it/2021/10/03/lac-news24-la-settimana-03-10-2021/',
+            'info_dict': {
+                'id': 'EUlZfGWkGpOd',
+                'title': 'La Settimana ',
+                'description': '03 Ottobre ore 02:00',
+                'ext': 'mp4',
+                'live_status': 'not_live',
+                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+                'duration': 1512
+            }
+        },
        {
            # Multiple gfycat iframe embeds
            'url': 'https://www.gezip.net/bbs/board.php?bo_table=entertaine&wr_id=613422',
@ -2411,7 +2425,6 @@ class GenericIE(InfoExtractor):
            },
            'playlist_count': 9
        }
-        #
    ]

    def report_following_redirect(self, new_url):
@ -3600,10 +3613,16 @@ class GenericIE(InfoExtractor):
        if tvp_urls:
            return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key())

+        # Look for MainStreaming embeds
+        mainstreaming_urls = MainStreamingIE._extract_urls(webpage)
+        if mainstreaming_urls:
+            return self.playlist_from_matches(mainstreaming_urls, video_id, video_title, ie=MainStreamingIE.ie_key())
+
        # Look for Gfycat Embeds
        gfycat_urls = GfycatIE._extract_urls(webpage)
        if gfycat_urls:
            return self.playlist_from_matches(gfycat_urls, video_id, video_title, ie=GfycatIE.ie_key())
+
        # Look for HTML5 media
        entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
        if entries:
--- a/yt_dlp/extractor/mainstreaming.py
+++ b/yt_dlp/extractor/mainstreaming.py
@ -0,0 +1,219 @@
+# coding: utf-8
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    parse_duration,
+    traverse_obj,
+    try_get,
+    urljoin
+)
+
+
+class MainStreamingIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:webtools-?)?(?P<host>[A-Za-z0-9-]*\.msvdn.net)/(?:embed|amp_embed|content)/(?P<id>\w+)'
+    IE_DESC = 'MainStreaming Player'
+
+    _TESTS = [
+        {
+            # Live stream offline, has alternative content id
+            'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/53EN6GxbWaJC',
+            'info_dict': {
+                'id': '53EN6GxbWaJC',
+                'title': 'Diretta homepage 2021-12-31 12:00',
+                'description': '',
+                'live_status': 'was_live',
+                'ext': 'mp4',
+                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+            },
+            'expected_warnings': [
+                'Ignoring alternative content ID: WDAF1KOWUpH3',
+                'MainStreaming said: Live event is OFFLINE'
+            ],
+            'skip': 'live stream offline'
+        }, {
+            # playlist
+            'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/WDAF1KOWUpH3',
+            'info_dict': {
+                'id': 'WDAF1KOWUpH3',
+                'title': 'Playlist homepage',
+            },
+            'playlist_mincount': 2
+        }, {
+            # livestream
+            'url': 'https://webtools-859c1818ed614cc5b0047439470927b0.msvdn.net/embed/tDoFkZD3T1Lw',
+            'info_dict': {
+                'id': 'tDoFkZD3T1Lw',
+                'title': r're:Class CNBC Live \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+                'live_status': 'is_live',
+                'ext': 'mp4',
+                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+            },
+            'skip': 'live stream'
+        }, {
+            'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/EUlZfGWkGpOd?autoPlay=false',
+            'info_dict': {
+                'id': 'EUlZfGWkGpOd',
+                'title': 'La Settimana ',
+                'description': '03 Ottobre ore 02:00',
+                'ext': 'mp4',
+                'live_status': 'not_live',
+                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+                'duration': 1512
+            }
+        }, {
+            # video without webtools- prefix
+            'url': 'https://f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/MfuWmzL2lGkA?autoplay=false&T=1635860445',
+            'info_dict': {
+                'id': 'MfuWmzL2lGkA',
+                'title': 'TG Mattina',
+                'description': '06 Ottobre ore 08:00',
+                'ext': 'mp4',
+                'live_status': 'not_live',
+                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+                'duration': 789.04
+            }
+        }, {
+            # always-on livestream with DVR
+            'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/HVvPMzy',
+            'info_dict': {
+                'id': 'HVvPMzy',
+                'title': r're:^Diretta LaC News24 \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+                'description': 'canale all news',
+                'live_status': 'is_live',
+                'ext': 'mp4',
+                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        }, {
+            # no host
+            'url': 'https://webtools.msvdn.net/embed/MfuWmzL2lGkA',
+            'only_matching': True
+        }, {
+            'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/amp_embed/tDoFkZD3T1Lw',
+            'only_matching': True
+        }, {
+            'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/content/tDoFkZD3T1Lw#',
+            'only_matching': True
+        }
+    ]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        mobj = re.findall(
+            r'<iframe[^>]+?src=["\']?(?P<url>%s)["\']?' % MainStreamingIE._VALID_URL, webpage)
+        if mobj:
+            return [group[0] for group in mobj]
+
+    def _playlist_entries(self, host, playlist_content):
+        for entry in playlist_content:
+            content_id = entry.get('contentID')
+            yield {
+                '_type': 'url',
+                'ie_key': MainStreamingIE.ie_key(),
+                'id': content_id,
+                'duration': int_or_none(traverse_obj(entry, ('duration', 'totalSeconds'))),
+                'title': entry.get('title'),
+                'url': f'https://{host}/embed/{content_id}'
+            }
+
+    @staticmethod
+    def _get_webtools_host(host):
+        if not host.startswith('webtools'):
+            host = 'webtools' + ('-' if not host.startswith('.') else '') + host
+        return host
+
+    def _get_webtools_base_url(self, host):
+        return f'{self.http_scheme()}//{self._get_webtools_host(host)}'
+
+    def _call_api(self, host: str, path: str, item_id: str, query=None, note='Downloading API JSON', fatal=False):
+        # JSON API, does not appear to be documented
+        return self._call_webtools_api(host, '/api/v2/' + path, item_id, query, note, fatal)
+
+    def _call_webtools_api(self, host: str, path: str, item_id: str, query=None, note='Downloading webtools API JSON', fatal=False):
+        # webtools docs: https://webtools.msvdn.net/
+        return self._download_json(
+            urljoin(self._get_webtools_base_url(host), path), item_id, query=query, note=note, fatal=fatal)
+
+    def _real_extract(self, url):
+        host, video_id = self._match_valid_url(url).groups()
+        content_info = try_get(
+            self._call_api(
+                host, f'content/{video_id}', video_id, note='Downloading content info API JSON'), lambda x: x['playerContentInfo'])
+        # Fallback
+        if not content_info:
+            webpage = self._download_webpage(url, video_id)
+            player_config = self._parse_json(
+                self._search_regex(
+                    r'config\s*=\s*({.+?})\s*;', webpage, 'mainstreaming player config',
+                    default='{}', flags=re.DOTALL),
+                video_id, transform_source=js_to_json, fatal=False) or {}
+            content_info = player_config['contentInfo']
+
+        host = content_info.get('host') or host
+        video_id = content_info.get('contentID') or video_id
+        title = content_info.get('title')
+        description = traverse_obj(content_info, 'longDescription', 'shortDescription', expected_type=str)
+        live_status = 'not_live'
+        if content_info.get('drmEnabled'):
+            self.report_drm(video_id)
+
+        alternative_content_id = content_info.get('alternativeContentID')
+        if alternative_content_id:
+            self.report_warning(f'Ignoring alternative content ID: {alternative_content_id}')
+
+        content_type = int_or_none(content_info.get('contentType'))
+        format_base_url = None
+        formats = []
+        subtitles = {}
+        # Live content
+        if content_type == 20:
+            dvr_enabled = traverse_obj(content_info, ('playerSettings', 'dvrEnabled'), expected_type=bool)
+            format_base_url = f"https://{host}/live/{content_info['liveSourceID']}/{video_id}/%s{'?DVR' if dvr_enabled else ''}"
+            live_status = 'is_live'
+            heartbeat = self._call_api(host, f'heartbeat/{video_id}', video_id, note='Checking stream status') or {}
+            if heartbeat.get('heartBeatUp') is False:
+                self.raise_no_formats(f'MainStreaming said: {heartbeat.get("responseMessage")}', expected=True)
+                live_status = 'was_live'
+
+        # Playlist
+        elif content_type == 31:
+            return self.playlist_result(
+                self._playlist_entries(host, content_info.get('playlistContents')), video_id, title, description)
+        # Normal video content?
+        elif content_type == 10:
+            format_base_url = f'https://{host}/vod/{video_id}/%s'
+            # Progressive format
+            # Note: in https://webtools.msvdn.net/loader/playerV2.js there is mention of original.mp3 format,
+            # however it seems to be the same as original.mp4?
+            formats.append({'url': format_base_url % 'original.mp4', 'format_note': 'original', 'quality': 1})
+        else:
+            self.raise_no_formats(f'Unknown content type {content_type}')
+
+        if format_base_url:
+            m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
+                format_base_url % 'playlist.m3u8', video_id=video_id, fatal=False)
+            mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
+                format_base_url % 'manifest.mpd', video_id=video_id, fatal=False)
+
+            subtitles = self._merge_subtitles(m3u8_subs, mpd_subs)
+            formats.extend(m3u8_formats + mpd_formats)
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'formats': formats,
+            'live_status': live_status,
+            'duration': parse_duration(content_info.get('duration')),
+            'tags': content_info.get('tags'),
+            'subtitles': subtitles,
+            'thumbnail': urljoin(self._get_webtools_base_url(host), f'image/{video_id}/poster')
+        }