[duboku] add playlist extractor

2020-08-29 15:04:16 +08:00 · 2020-08-29 15:04:16 +08:00 · de4144a4ae
commit de4144a4ae
parent 503406d4bc
2 changed files with 97 additions and 1 deletion
--- a/youtube_dl/extractor/duboku.py
+++ b/youtube_dl/extractor/duboku.py
@ -4,10 +4,49 @@ from __future__ import unicode_literals
 import re

 from .common import InfoExtractor
+from ..compat import compat_urlparse
 from ..utils import *


+def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
+    """Find HTML elements by tag name and, optionally, by one attribute.
+
+    html         -- the document text to search
+    tag          -- regex fragment for the tag name (used unescaped); when
+                    None, any name made of [a-zA-Z0-9:._-] characters matches
+    attribute    -- literal attribute name the element must carry, or None
+    value        -- required attribute value, or None for no constraint
+    escape_value -- when true, value is matched literally; when false it is
+                    inserted into the pattern as a regex fragment
+
+    Returns a list of regex match objects in document order.  Each match has
+    the named groups 'tag' and 'content', plus 'attribute' and 'value' when
+    the corresponding argument was given.  The content is matched
+    non-greedily up to the first closing tag with the same name.
+    """
+
+    tag_re = '[a-zA-Z0-9:._-]+' if tag is None else tag
+
+    attr_re = ''
+    if attribute is not None:
+        attr_re = r'\s+(?P<attribute>%s)' % re.escape(attribute)
+
+    value_re = ''
+    if value is not None:
+        if escape_value:
+            value = re.escape(value)
+        value_re = '=[\'"]?(?P<value>%s)[\'"]?' % value
+
+    # The pattern is verbose ((?x)), so its layout whitespace is ignored; the
+    # requested attribute may sit anywhere among the element's attributes.
+    pattern = r'''(?xs)
+        <(?P<tag>%s)
+         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
+         %s%s
+         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
+        \s*>
+        (?P<content>.*?)
+        </\1>
+    ''' % (tag_re, attr_re, value_re)
+
+    return list(re.finditer(pattern, html))
+
+
+def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
+    """Return the first match from _get_elements_by_tag_and_attrib, or None.
+
+    Takes the same arguments as _get_elements_by_tag_and_attrib and passes
+    them through unchanged.
+    """
+    matches = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
+    if not matches:
+        return None
+    return matches[0]
+
+
 class DubokuIE(InfoExtractor):
+    IE_NAME = 'duboku'
+    IE_DESC = 'www.duboku.co'
+
    _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9\-]+)\.html.*'
    _TESTS = [{
        'url': 'https://www.duboku.co/vodplay/1575-1-1.html',
@ -90,3 +129,57 @@ class DubokuIE(InfoExtractor):
            'episode_id': episode_id,
            'formats': formats,
        }
+
+
+class DubokuPlaylistIE(InfoExtractor):
+    """Extractor for duboku.co series pages (``/voddetail/<id>.html``).
+
+    The page is scanned for elements whose ``id`` looks like ``playlist<N>``;
+    every ``<a href=...>`` inside such an element becomes one playlist entry.
+    A URL fragment (e.g. ``#playlist2``) selects the element with that id;
+    without a fragment the first such element on the page is used.
+    """
+
+    IE_NAME = 'duboku:list'
+    IE_DESC = 'www.duboku.co entire series'
+
+    _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*'
+
+    def _real_extract(self, url):
+        """Return a playlist result with one URL entry per episode link.
+
+        Raises ExtractorError when the URL does not match _VALID_URL, or when
+        the requested (or any) episode list cannot be found on the page.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError('Invalid URL: %s' % url)
+        series_id = mobj.group('id')
+        # an optional fragment names the episode list to pick
+        fragment = compat_urlparse.urlparse(url).fragment
+
+        webpage_url = 'https://www.duboku.co/voddetail/%s.html' % series_id
+        webpage_html = self._download_webpage(webpage_url, series_id)
+
+        # extract title: <h1 class="title">, then the "keywords" meta tag,
+        # then the <title> element
+
+        title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
+        title = unescapeHTML(title.group('content')) if title else None
+        if not title:
+            title = self._html_search_meta('keywords', webpage_html)
+        if not title:
+            title = _get_element_by_tag_and_attrib(webpage_html, 'title')
+            title = unescapeHTML(title.group('content')) if title else None
+
+        # extract playlists
+
+        playlists = {}
+        # Remember the first id explicitly: plain dict ordering is not
+        # guaranteed on every supported interpreter.
+        first_playlist_id = None
+        for div in _get_elements_by_tag_and_attrib(
+                webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
+            playlist_id = div.group('value')
+            if first_playlist_id is None:
+                first_playlist_id = playlist_id
+            playlists[playlist_id] = [
+                {
+                    'href': unescapeHTML(a.group('value')),
+                    'title': unescapeHTML(a.group('content')),
+                }
+                for a in _get_elements_by_tag_and_attrib(
+                    div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False)]
+
+        # select the specified playlist if url fragment exists; an empty page
+        # yields None here so the error below is raised instead of a bare
+        # StopIteration
+        playlist = playlists.get(fragment if fragment else first_playlist_id)
+        if not playlist:
+            raise ExtractorError(
+                ('Cannot find %s' % fragment) if fragment else 'Cannot extract playlist')
+
+        # return url results; urljoin keeps root-relative hrefs unchanged and
+        # also copes with absolute or non-root-relative ones
+        return self.playlist_result([
+            self.url_result(
+                compat_urlparse.urljoin('https://www.duboku.co', x['href']),
+                video_title=x.get('title'))
+            for x in playlist], series_id, title)
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -282,7 +282,10 @@ from .drtv import (
 )
 from .dtube import DTubeIE
 from .dvtv import DVTVIE
-from .duboku import DubokuIE
+from .duboku import (
+    DubokuIE,
+    DubokuPlaylistIE
+)
 from .dumpert import DumpertIE
 from .defense import DefenseGouvFrIE
 from .discovery import DiscoveryIE