[wwe] Fix issues, extract subtitles and add support for playlists (closes #14781, closes #17450)

This commit is contained in:
Sergey M․ 2018-11-17 23:59:20 +07:00
parent 11d19ff503
commit 006374e3ae
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D

View file

@ -1,20 +1,75 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str from ..compat import compat_str
from ..utils import urljoin from ..utils import (
try_get,
unescapeHTML,
url_or_none,
urljoin,
)
class WWEIE(InfoExtractor): class WWEBaseIE(InfoExtractor):
_VALID_URL = r'https?://(?:\w+\.)?wwe.com/(?:.*/)?videos/(?P<id>[\w-]+)' _SUBTITLE_LANGS = {
'English': 'en',
'Deutsch': 'de',
}
def _extract_entry(self, data, url, video_id=None):
    """Build an info dict for a single video from its player metadata.

    `data` is one playlist item of the page's player configuration. It
    must contain 'title' and 'file', and also 'nid' when `video_id` is
    not supplied (a missing key raises KeyError). `url` is the page URL,
    used as the base for resolving the thumbnail path.

    Returns a dict with 'id', 'title', 'description', 'thumbnail',
    'series', 'episode', 'formats' and 'subtitles'.
    """
    video_id = compat_str(video_id or data['nid'])
    title = data['title']

    # The 'file' value may be protocol-relative ('//host/path'); give it
    # an explicit scheme so the manifest URL and every format URL derived
    # from it are absolute.
    formats = self._extract_m3u8_formats(
        self._proto_relative_url(data['file']), video_id, 'mp4',
        entry_protocol='m3u8_native', m3u8_id='hls')

    description = data.get('description')
    thumbnail = urljoin(url, data.get('image'))
    series = data.get('show_name')
    episode = data.get('episode_name')

    subtitles = {}
    tracks = data.get('tracks')
    if isinstance(tracks, list):
        for track in tracks:
            if not isinstance(track, dict):
                continue
            # Only caption tracks are collected; other kinds are ignored.
            if track.get('kind') != 'captions':
                continue
            track_file = url_or_none(track.get('file'))
            if not track_file:
                continue
            label = track.get('label')
            # Known labels map to language codes, unknown labels are
            # used as-is, and an absent/empty label falls back to 'en'.
            lang = self._SUBTITLE_LANGS.get(label, label) or 'en'
            subtitles.setdefault(lang, []).append({
                'url': track_file,
            })

    return {
        'id': video_id,
        'title': title,
        'description': description,
        'thumbnail': thumbnail,
        'series': series,
        'episode': episode,
        'formats': formats,
        'subtitles': subtitles,
    }
class WWEIE(WWEBaseIE):
_VALID_URL = r'https?://(?:[^/]+\.)?wwe\.com/(?:[^/]+/)*videos/(?P<id>[^/?#&]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.wwe.com/videos/daniel-bryan-vs-andrade-cien-almas-smackdown-live-sept-4-2018', 'url': 'https://www.wwe.com/videos/daniel-bryan-vs-andrade-cien-almas-smackdown-live-sept-4-2018',
'md5': '30cbc824b51f4010ea885bfcaec76972', 'md5': '92811c6a14bfc206f7a6a9c5d9140184',
'info_dict': { 'info_dict': {
'id': '40048199', 'id': '40048199',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Daniel Bryan vs. Andrade "Cien" Almas: SmackDown LIVE, Sept. 4, 2018', 'title': 'Daniel Bryan vs. Andrade "Cien" Almas: SmackDown LIVE, Sept. 4, 2018',
'description': 'Still fuming after he and his wife Brie Bella were attacked by The Miz and Maryse last week, Daniel Bryan takes care of some unfinished business with Andrade "Cien" Almas.', 'description': 'md5:2d7424dbc6755c61a0e649d2a8677f67',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
} }
}, { }, {
@ -26,31 +81,60 @@ class WWEIE(InfoExtractor):
display_id = self._match_id(url) display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
drupal_settings = self._parse_json( landing = self._parse_json(
self._html_search_regex( self._html_search_regex(
r'(?s)Drupal\.settings\s*,\s*({.+?})\);', r'(?s)Drupal\.settings\s*,\s*({.+?})\s*\)\s*;',
webpage, 'drupal settings'), webpage, 'drupal settings'),
display_id) display_id)['WWEVideoLanding']
player = drupal_settings['WWEVideoLanding']['initialVideo'] data = landing['initialVideo']['playlist'][0]
metadata = player['playlist'][0] video_id = landing.get('initialVideoId')
id = compat_str(metadata['nid']) info = self._extract_entry(data, url, video_id)
title = metadata.get('title') or self._og_search_title(webpage) info['display_id'] = display_id
video_url = 'https:' + metadata['file'] return info
thumbnail = None
if metadata.get('image') is not None:
thumbnail = urljoin(url, metadata.get('image'))
description = metadata.get('description')
formats = self._extract_m3u8_formats(video_url, id, 'mp4')
return { class WWEPlaylistIE(WWEBaseIE):
'id': id, _VALID_URL = r'https?://(?:[^/]+\.)?wwe\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'
'title': title, _TESTS = [{
'formats': formats, 'url': 'https://www.wwe.com/shows/raw/2018-11-12',
'url': video_url, 'info_dict': {
'display_id': display_id, 'id': '2018-11-12',
'thumbnail': thumbnail, },
'description': description, 'playlist_mincount': 11,
} }, {
'url': 'http://www.wwe.com/article/walk-the-prank-wwe-edition',
'only_matching': True,
}, {
'url': 'https://www.wwe.com/shows/wwenxt/article/matt-riddle-interview',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
    """Return whether this extractor should handle `url`.

    URLs that WWEIE already accepts are rejected here; everything else
    is decided by the inherited `suitable` check.
    """
    if WWEIE.suitable(url):
        return False
    return super(WWEPlaylistIE, cls).suitable(url)
def _real_extract(self, url):
    """Collect every video embedded in the page into a playlist.

    Each `data-video` attribute found in the downloaded page is parsed
    as HTML-escaped JSON; the first item of its 'playlist' list is
    turned into an entry via `_extract_entry`. Attributes that do not
    parse, lack a playlist item, or fail extraction are skipped.

    Returns the playlist result for the collected entries, identified
    by the display id matched from `url`.
    """
    display_id = self._match_id(url)
    webpage = self._download_webpage(url, display_id)

    entries = []
    for mobj in re.finditer(
            r'data-video\s*=\s*(["\'])(?P<data>{.+?})\1', webpage):
        video = self._parse_json(
            mobj.group('data'), display_id, transform_source=unescapeHTML,
            fatal=False)
        if not video:
            continue
        data = try_get(video, lambda x: x['playlist'][0], dict)
        if not data:
            continue
        try:
            entry = self._extract_entry(data, url)
        except Exception as e:
            # Best effort: one broken entry must not abort the whole
            # playlist, but the failure is reported instead of being
            # dropped silently. %r keeps the message ASCII-safe.
            self.report_warning(
                'Skipping playlist entry that failed to extract: %r' % (e,))
            continue
        # Attribute the entry to the single-video extractor.
        entry['extractor_key'] = WWEIE.ie_key()
        entries.append(entry)

    return self.playlist_result(entries, display_id)