[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles and extended to collect subtitle tracks instead of skipping them; it now returns a (formats, subtitles) pair. A wrapper with the old name is provided for compatibility: it returns only the formats and emits a warning when subtitle tracks were found and dropped. _parse_m3u8_formats is likewise renamed (to _parse_m3u8_formats_and_subtitles) and extended, but without adding a compatibility wrapper; the test suite is adjusted to test the enhanced method instead.
2016-11-07 15:45:42 +01:00 · 2016-11-07 15:45:42 +01:00 · a0c3b2d5cf
commit a0c3b2d5cf
parent 19bb39202d
2 changed files with 43 additions and 18 deletions
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@ -684,17 +684,19 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
                    'width': 1920,
                    'height': 1080,
                    'vcodec': 'avc1.64002a',
-                }]
+                }],
+                {}
            ),
        ]

-        for m3u8_file, m3u8_url, expected_formats in _TEST_CASES:
+        for m3u8_file, m3u8_url, expected_formats, expected_subs in _TEST_CASES:
            with io.open('./test/testdata/m3u8/%s.m3u8' % m3u8_file,
                         mode='r', encoding='utf-8') as f:
-                formats = self.ie._parse_m3u8_formats(
+                formats, subs = self.ie._parse_m3u8_formats_and_subtitles(
                    f.read(), m3u8_url, ext='mp4')
                self.ie._sort_formats(formats)
                expect_value(self, formats, expected_formats, None)
+                expect_value(self, subs, expected_subs, None)

    def test_parse_mpd_formats(self):
        _TEST_CASES = [
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@ -1879,11 +1879,21 @@ class InfoExtractor(object):
            'format_note': 'Quality selection URL',
        }

-    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
-                              entry_protocol='m3u8', preference=None, quality=None,
-                              m3u8_id=None, note=None, errnote=None,
-                              fatal=True, live=False, data=None, headers={},
+    def _extract_m3u8_formats(self, *args, **kwargs):
+        fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
+        if subs:
+            self.report_warning(bug_reports_message(
+                "Ignoring subtitle tracks found in the HLS manifest; "
+                "if any subtitle tracks are missing,"
+            ))
+        return fmts
+
+    def _extract_m3u8_formats_and_subtitles(
+            self, m3u8_url, video_id, ext=None, entry_protocol='m3u8',
+            preference=None, quality=None, m3u8_id=None, note=None,
+            errnote=None, fatal=True, live=False, data=None, headers={},
            query={}):
+
        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
@ -1891,30 +1901,34 @@ class InfoExtractor(object):
            fatal=fatal, data=data, headers=headers, query=query)

        if res is False:
-            return []
+            return [], {}

        m3u8_doc, urlh = res
        m3u8_url = urlh.geturl()

-        return self._parse_m3u8_formats(
+        return self._parse_m3u8_formats_and_subtitles(
            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
            preference=preference, quality=quality, m3u8_id=m3u8_id,
            note=note, errnote=errnote, fatal=fatal, live=live, data=data,
            headers=headers, query=query, video_id=video_id)

-    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
-                            entry_protocol='m3u8', preference=None, quality=None,
-                            m3u8_id=None, live=False, note=None, errnote=None,
-                            fatal=True, data=None, headers={}, query={}, video_id=None):
+    def _parse_m3u8_formats_and_subtitles(
+            self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8',
+            preference=None, quality=None, m3u8_id=None, live=False, note=None,
+            errnote=None, fatal=True, data=None, headers={}, query={},
+            video_id=None):
+
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
-            return []
+            return [], {}

        if (not self._downloader.params.get('allow_unplayable_formats')
                and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)):  # Apple FairPlay
-            return []
+            return [], {}

        formats = []

+        subtitles = {}
+
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
@ -2001,7 +2015,7 @@ class InfoExtractor(object):
                }
                formats.append(f)

-            return formats
+            return formats, subtitles

        groups = {}
        last_stream_inf = {}
@ -2013,6 +2027,15 @@ class InfoExtractor(object):
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
+            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
+            if media_type == 'SUBTITLES':
+                lang = media['LANGUAGE']  # XXX: normalise?
+                url = format_url(media['URI'])
+                sub_info = {
+                    'url': url,
+                    'ext': determine_ext(url),
+                }
+                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
@ -2160,7 +2183,7 @@ class InfoExtractor(object):
                        formats.append(http_f)

                last_stream_inf = {}
-        return formats
+        return formats, subtitles

    @staticmethod
    def _xpath_ns(path, namespace=None):