[ie/douyutv] Fix extractors (#7652)
Closes #2494, Closes #7295 Authored by: c-basalt
This commit is contained in:
parent
b3febedbeb
commit
21f40e75df
1 changed files with 184 additions and 89 deletions
|
@ -1,31 +1,72 @@
|
||||||
import time
|
import time
|
||||||
import hashlib
|
import hashlib
|
||||||
import re
|
|
||||||
import urllib
|
import urllib
|
||||||
|
import uuid
|
||||||
|
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
|
from .openload import PhantomJSwrapper
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
ExtractorError,
|
ExtractorError,
|
||||||
|
UserNotLive,
|
||||||
|
determine_ext,
|
||||||
|
int_or_none,
|
||||||
|
js_to_json,
|
||||||
|
parse_resolution,
|
||||||
|
str_or_none,
|
||||||
|
traverse_obj,
|
||||||
unescapeHTML,
|
unescapeHTML,
|
||||||
unified_strdate,
|
url_or_none,
|
||||||
|
urlencode_postdata,
|
||||||
urljoin,
|
urljoin,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class DouyuTVIE(InfoExtractor):
|
class DouyuBaseIE(InfoExtractor):
|
||||||
IE_DESC = '斗鱼'
|
def _download_cryptojs_md5(self, video_id):
|
||||||
|
for url in [
|
||||||
|
'https://cdnjs.cloudflare.com/ajax/libs/crypto-js/3.1.2/rollups/md5.js',
|
||||||
|
'https://cdn.bootcdn.net/ajax/libs/crypto-js/3.1.2/rollups/md5.js',
|
||||||
|
]:
|
||||||
|
js_code = self._download_webpage(
|
||||||
|
url, video_id, note='Downloading signing dependency', fatal=False)
|
||||||
|
if js_code:
|
||||||
|
self.cache.store('douyu', 'crypto-js-md5', js_code)
|
||||||
|
return js_code
|
||||||
|
raise ExtractorError('Unable to download JS dependency (crypto-js/md5)')
|
||||||
|
|
||||||
|
def _get_cryptojs_md5(self, video_id):
|
||||||
|
return self.cache.load('douyu', 'crypto-js-md5') or self._download_cryptojs_md5(video_id)
|
||||||
|
|
||||||
|
def _calc_sign(self, sign_func, video_id, a):
|
||||||
|
b = uuid.uuid4().hex
|
||||||
|
c = round(time.time())
|
||||||
|
js_script = f'{self._get_cryptojs_md5(video_id)};{sign_func};console.log(ub98484234("{a}","{b}","{c}"))'
|
||||||
|
phantom = PhantomJSwrapper(self)
|
||||||
|
result = phantom.execute(js_script, video_id,
|
||||||
|
note='Executing JS signing script').strip()
|
||||||
|
return {i: v[0] for i, v in urllib.parse.parse_qs(result).items()}
|
||||||
|
|
||||||
|
def _search_js_sign_func(self, webpage, fatal=True):
|
||||||
|
# The greedy look-behind ensures last possible script tag is matched
|
||||||
|
return self._search_regex(
|
||||||
|
r'(?:<script.*)?<script[^>]*>(.*?ub98484234.*?)</script>', webpage, 'JS sign func', fatal=fatal)
|
||||||
|
|
||||||
|
|
||||||
|
class DouyuTVIE(DouyuBaseIE):
|
||||||
|
IE_DESC = '斗鱼直播'
|
||||||
_VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(topic/\w+\?rid=|(?:[^/]+/))*(?P<id>[A-Za-z0-9]+)'
|
_VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(topic/\w+\?rid=|(?:[^/]+/))*(?P<id>[A-Za-z0-9]+)'
|
||||||
_TESTS = [{
|
_TESTS = [{
|
||||||
'url': 'http://www.douyutv.com/iseven',
|
'url': 'https://www.douyu.com/pigff',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': '17732',
|
'id': '24422',
|
||||||
'display_id': 'iseven',
|
'display_id': 'pigff',
|
||||||
'ext': 'flv',
|
'ext': 'mp4',
|
||||||
'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
|
'title': 're:^【PIGFF】.* [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
|
||||||
'description': r're:.*m7show@163\.com.*',
|
'description': r'≥15级牌子看鱼吧置顶帖进粉丝vx群',
|
||||||
'thumbnail': r're:^https?://.*\.png',
|
'thumbnail': str,
|
||||||
'uploader': '7师傅',
|
'uploader': 'pigff',
|
||||||
'is_live': True,
|
'is_live': True,
|
||||||
|
'live_status': 'is_live',
|
||||||
},
|
},
|
||||||
'params': {
|
'params': {
|
||||||
'skip_download': True,
|
'skip_download': True,
|
||||||
|
@ -85,15 +126,43 @@ class DouyuTVIE(InfoExtractor):
|
||||||
'only_matching': True,
|
'only_matching': True,
|
||||||
}]
|
}]
|
||||||
|
|
||||||
|
def _get_sign_func(self, room_id, video_id):
|
||||||
|
return self._download_json(
|
||||||
|
f'https://www.douyu.com/swf_api/homeH5Enc?rids={room_id}', video_id,
|
||||||
|
note='Getting signing script')['data'][f'room{room_id}']
|
||||||
|
|
||||||
|
def _extract_stream_formats(self, stream_formats):
|
||||||
|
formats = []
|
||||||
|
for stream_info in traverse_obj(stream_formats, (..., 'data')):
|
||||||
|
stream_url = urljoin(
|
||||||
|
traverse_obj(stream_info, 'rtmp_url'), traverse_obj(stream_info, 'rtmp_live'))
|
||||||
|
if stream_url:
|
||||||
|
rate_id = traverse_obj(stream_info, ('rate', {int_or_none}))
|
||||||
|
rate_info = traverse_obj(stream_info, ('multirates', lambda _, v: v['rate'] == rate_id), get_all=False)
|
||||||
|
ext = determine_ext(stream_url)
|
||||||
|
formats.append({
|
||||||
|
'url': stream_url,
|
||||||
|
'format_id': str_or_none(rate_id),
|
||||||
|
'ext': 'mp4' if ext == 'm3u8' else ext,
|
||||||
|
'protocol': 'm3u8_native' if ext == 'm3u8' else 'https',
|
||||||
|
'quality': rate_id % -10000 if rate_id is not None else None,
|
||||||
|
**traverse_obj(rate_info, {
|
||||||
|
'format': ('name', {str_or_none}),
|
||||||
|
'tbr': ('bit', {int_or_none}),
|
||||||
|
}),
|
||||||
|
})
|
||||||
|
return formats
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
video_id = self._match_id(url)
|
video_id = self._match_id(url)
|
||||||
|
|
||||||
if video_id.isdigit():
|
webpage = self._download_webpage(url, video_id)
|
||||||
room_id = video_id
|
room_id = self._search_regex(r'\$ROOM\.room_id\s*=\s*(\d+)', webpage, 'room id')
|
||||||
else:
|
|
||||||
page = self._download_webpage(url, video_id)
|
if self._search_regex(r'"videoLoop"\s*:\s*(\d+)', webpage, 'loop', default='') == '1':
|
||||||
room_id = self._html_search_regex(
|
raise UserNotLive('The channel is auto-playing VODs', video_id=video_id)
|
||||||
r'"room_id\\?"\s*:\s*(\d+),', page, 'room id')
|
if self._search_regex(r'\$ROOM\.show_status\s*=\s*(\d+)', webpage, 'status', default='') == '2':
|
||||||
|
raise UserNotLive(video_id=video_id)
|
||||||
|
|
||||||
# Grab metadata from API
|
# Grab metadata from API
|
||||||
params = {
|
params = {
|
||||||
|
@ -102,110 +171,136 @@ class DouyuTVIE(InfoExtractor):
|
||||||
'time': int(time.time()),
|
'time': int(time.time()),
|
||||||
}
|
}
|
||||||
params['auth'] = hashlib.md5(
|
params['auth'] = hashlib.md5(
|
||||||
f'room/{video_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest()
|
f'room/{room_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest()
|
||||||
room = self._download_json(
|
room = traverse_obj(self._download_json(
|
||||||
f'http://www.douyutv.com/api/v1/room/{room_id}', video_id,
|
f'http://www.douyutv.com/api/v1/room/{room_id}', video_id,
|
||||||
note='Downloading room info', query=params)['data']
|
note='Downloading room info', query=params, fatal=False), 'data')
|
||||||
|
|
||||||
# 1 = live, 2 = offline
|
# 1 = live, 2 = offline
|
||||||
if room.get('show_status') == '2':
|
if traverse_obj(room, 'show_status') == '2':
|
||||||
raise ExtractorError('Live stream is offline', expected=True)
|
raise UserNotLive(video_id=video_id)
|
||||||
|
|
||||||
video_url = urljoin('https://hls3-akm.douyucdn.cn/', self._search_regex(r'(live/.*)', room['hls_url'], 'URL'))
|
js_sign_func = self._search_js_sign_func(webpage, fatal=False) or self._get_sign_func(room_id, video_id)
|
||||||
formats, subs = self._extract_m3u8_formats_and_subtitles(video_url, room_id)
|
form_data = {
|
||||||
|
'rate': 0,
|
||||||
|
**self._calc_sign(js_sign_func, video_id, room_id),
|
||||||
|
}
|
||||||
|
stream_formats = [self._download_json(
|
||||||
|
f'https://www.douyu.com/lapi/live/getH5Play/{room_id}',
|
||||||
|
video_id, note="Downloading livestream format",
|
||||||
|
data=urlencode_postdata(form_data))]
|
||||||
|
|
||||||
title = unescapeHTML(room['room_name'])
|
for rate_id in traverse_obj(stream_formats[0], ('data', 'multirates', ..., 'rate')):
|
||||||
description = room.get('show_details')
|
if rate_id != traverse_obj(stream_formats[0], ('data', 'rate')):
|
||||||
thumbnail = room.get('room_src')
|
form_data['rate'] = rate_id
|
||||||
uploader = room.get('nickname')
|
stream_formats.append(self._download_json(
|
||||||
|
f'https://www.douyu.com/lapi/live/getH5Play/{room_id}',
|
||||||
|
video_id, note=f'Downloading livestream format {rate_id}',
|
||||||
|
data=urlencode_postdata(form_data)))
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'id': room_id,
|
'id': room_id,
|
||||||
'display_id': video_id,
|
'formats': self._extract_stream_formats(stream_formats),
|
||||||
'title': title,
|
|
||||||
'description': description,
|
|
||||||
'thumbnail': thumbnail,
|
|
||||||
'uploader': uploader,
|
|
||||||
'is_live': True,
|
'is_live': True,
|
||||||
'subtitles': subs,
|
**traverse_obj(room, {
|
||||||
'formats': formats,
|
'display_id': ('url', {str}, {lambda i: i[1:]}),
|
||||||
|
'title': ('room_name', {unescapeHTML}),
|
||||||
|
'description': ('show_details', {str}),
|
||||||
|
'uploader': ('nickname', {str}),
|
||||||
|
'thumbnail': ('room_src', {url_or_none}),
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class DouyuShowIE(InfoExtractor):
|
class DouyuShowIE(DouyuBaseIE):
|
||||||
_VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)'
|
_VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)'
|
||||||
|
|
||||||
_TESTS = [{
|
_TESTS = [{
|
||||||
'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw',
|
'url': 'https://v.douyu.com/show/mPyq7oVNe5Yv1gLY',
|
||||||
'md5': '0c2cfd068ee2afe657801269b2d86214',
|
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': 'rjNBdvnVXNzvE2yw',
|
'id': 'mPyq7oVNe5Yv1gLY',
|
||||||
'ext': 'mp4',
|
'ext': 'mp4',
|
||||||
'title': '陈一发儿:砒霜 我有个室友系列!04-01 22点场',
|
'title': '四川人小时候的味道“蒜苗回锅肉”,传统菜不能丢,要常做来吃',
|
||||||
'duration': 7150.08,
|
'duration': 633,
|
||||||
'thumbnail': r're:^https?://.*\.jpg$',
|
'thumbnail': str,
|
||||||
'uploader': '陈一发儿',
|
'uploader': '美食作家王刚V',
|
||||||
'uploader_id': 'XrZwYelr5wbK',
|
'uploader_id': 'OVAO4NVx1m7Q',
|
||||||
'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK',
|
'timestamp': 1661850002,
|
||||||
'upload_date': '20170402',
|
'upload_date': '20220830',
|
||||||
|
'view_count': int,
|
||||||
|
'tags': ['美食', '美食综合'],
|
||||||
},
|
},
|
||||||
}, {
|
}, {
|
||||||
'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw',
|
'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw',
|
||||||
'only_matching': True,
|
'only_matching': True,
|
||||||
}]
|
}]
|
||||||
|
|
||||||
|
_FORMATS = {
|
||||||
|
'super': '原画',
|
||||||
|
'high': '超清',
|
||||||
|
'normal': '高清',
|
||||||
|
}
|
||||||
|
|
||||||
|
_QUALITIES = {
|
||||||
|
'super': -1,
|
||||||
|
'high': -2,
|
||||||
|
'normal': -3,
|
||||||
|
}
|
||||||
|
|
||||||
|
_RESOLUTIONS = {
|
||||||
|
'super': '1920x1080',
|
||||||
|
'high': '1280x720',
|
||||||
|
'normal': '852x480',
|
||||||
|
}
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
url = url.replace('vmobile.', 'v.')
|
url = url.replace('vmobile.', 'v.')
|
||||||
video_id = self._match_id(url)
|
video_id = self._match_id(url)
|
||||||
|
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
|
|
||||||
room_info = self._parse_json(self._search_regex(
|
video_info = self._search_json(
|
||||||
r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id)
|
r'<script>\s*window\.\$DATA\s*=', webpage,
|
||||||
|
'video info', video_id, transform_source=js_to_json)
|
||||||
|
|
||||||
video_info = None
|
js_sign_func = self._search_js_sign_func(webpage)
|
||||||
|
form_data = {
|
||||||
|
'vid': video_id,
|
||||||
|
**self._calc_sign(js_sign_func, video_id, video_info['ROOM']['point_id']),
|
||||||
|
}
|
||||||
|
url_info = self._download_json(
|
||||||
|
'https://v.douyu.com/api/stream/getStreamUrl', video_id,
|
||||||
|
data=urlencode_postdata(form_data), note="Downloading video formats")
|
||||||
|
|
||||||
for trial in range(5):
|
formats = []
|
||||||
# Sometimes Douyu rejects our request. Let's try it more times
|
for name, url in traverse_obj(url_info, ('data', 'thumb_video', {dict.items}, ...)):
|
||||||
try:
|
video_url = traverse_obj(url, ('url', {url_or_none}))
|
||||||
video_info = self._download_json(
|
if video_url:
|
||||||
'https://vmobile.douyu.com/video/getInfo', video_id,
|
ext = determine_ext(video_url)
|
||||||
query={'vid': video_id},
|
formats.append({
|
||||||
headers={
|
'format': self._FORMATS.get(name),
|
||||||
'Referer': url,
|
'format_id': name,
|
||||||
'x-requested-with': 'XMLHttpRequest',
|
'url': video_url,
|
||||||
})
|
'quality': self._QUALITIES.get(name),
|
||||||
break
|
'ext': 'mp4' if ext == 'm3u8' else ext,
|
||||||
except ExtractorError:
|
'protocol': 'm3u8_native' if ext == 'm3u8' else 'https',
|
||||||
self._sleep(1, video_id)
|
**parse_resolution(self._RESOLUTIONS.get(name))
|
||||||
|
})
|
||||||
if not video_info:
|
else:
|
||||||
raise ExtractorError('Can\'t fetch video info')
|
self.to_screen(
|
||||||
|
f'"{self._FORMATS.get(name, name)}" format may require logging in. {self._login_hint()}')
|
||||||
formats = self._extract_m3u8_formats(
|
|
||||||
video_info['data']['video_url'], video_id,
|
|
||||||
entry_protocol='m3u8_native', ext='mp4')
|
|
||||||
|
|
||||||
upload_date = unified_strdate(self._html_search_regex(
|
|
||||||
r'<em>上传时间:</em><span>([^<]+)</span>', webpage,
|
|
||||||
'upload date', fatal=False))
|
|
||||||
|
|
||||||
uploader = uploader_id = uploader_url = None
|
|
||||||
mobj = re.search(
|
|
||||||
r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"',
|
|
||||||
webpage)
|
|
||||||
if mobj:
|
|
||||||
uploader_id, uploader = mobj.groups()
|
|
||||||
uploader_url = urljoin(url, '/author/' + uploader_id)
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'id': video_id,
|
'id': video_id,
|
||||||
'title': room_info['name'],
|
|
||||||
'formats': formats,
|
'formats': formats,
|
||||||
'duration': room_info.get('duration'),
|
**traverse_obj(video_info, ('DATA', {
|
||||||
'thumbnail': room_info.get('pic'),
|
'title': ('content', 'title', {str}),
|
||||||
'upload_date': upload_date,
|
'uploader': ('content', 'author', {str}),
|
||||||
'uploader': uploader,
|
'uploader_id': ('content', 'up_id', {str_or_none}),
|
||||||
'uploader_id': uploader_id,
|
'duration': ('content', 'video_duration', {int_or_none}),
|
||||||
'uploader_url': uploader_url,
|
'thumbnail': ('content', 'video_pic', {url_or_none}),
|
||||||
|
'timestamp': ('content', 'create_time', {int_or_none}),
|
||||||
|
'view_count': ('content', 'view_num', {int_or_none}),
|
||||||
|
'tags': ('videoTag', ..., 'tagName', {str}),
|
||||||
|
}))
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue