[mildom] Rework extractors (#2940)

Authored by: Lesmiscore
This commit is contained in:
Lesmiscore (Naoya Ozaki) 2022-03-08 23:49:10 +09:00 committed by GitHub
parent 409cdd1ec9
commit fb6e3f4389
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 135 additions and 157 deletions

View file

@ -848,6 +848,7 @@ from .microsoftvirtualacademy import (
from .mildom import ( from .mildom import (
MildomIE, MildomIE,
MildomVodIE, MildomVodIE,
MildomClipIE,
MildomUserVodIE, MildomUserVodIE,
) )
from .minds import ( from .minds import (

View file

@ -1,102 +1,43 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import base64 import functools
from datetime import datetime
import itertools
import json import json
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
update_url_query, determine_ext,
random_uuidv4, dict_get,
try_get, ExtractorError,
float_or_none, float_or_none,
dict_get OnDemandPagedList,
) random_uuidv4,
from ..compat import ( traverse_obj,
compat_str, update_url_query,
) )
class MildomBaseIE(InfoExtractor):
    """Common base for the Mildom extractors; provides the JSON API helper."""

    # Guest identifier attached to every API request; created on first use.
    _GUEST_ID = None

    def _call_api(self, url, video_id, query=None, note='Downloading JSON metadata', body=None):
        """Fetch `url` as JSON and return the reply's 'body' member.

        `query` entries are merged over the default query parameters
        ('__guest_id' and '__platform'). When `body` is truthy it is dumped
        to JSON, encoded and passed as the request data together with a JSON
        Content-Type header; otherwise no data and no extra headers are sent.

        Raises ExtractorError (marked as expected) when the reply's 'code'
        is not 0.
        """
        if not self._GUEST_ID:
            self._GUEST_ID = f'pc-gp-{random_uuidv4()}'

        if body:
            payload = json.dumps(body).encode()
            request_headers = {'Content-Type': 'application/json'}
        else:
            payload = None
            request_headers = {}

        default_params = {
            '__guest_id': self._GUEST_ID,
            '__platform': 'web',
        }
        # Caller-supplied parameters take precedence over the defaults.
        params = {**default_params, **(query or {})}

        response = self._download_json(
            url, video_id, note=note, data=payload,
            headers=request_headers, query=params)

        if response['code'] == 0:
            return response['body']
        raise ExtractorError(
            f'Mildom says: {response["message"]} (code {response["code"]})',
            expected=True)
class MildomIE(MildomBaseIE): class MildomIE(MildomBaseIE):
@ -106,31 +47,13 @@ class MildomIE(MildomBaseIE):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
url = 'https://www.mildom.com/%s' % video_id webpage = self._download_webpage(f'https://www.mildom.com/{video_id}', video_id)
webpage = self._download_webpage(url, video_id)
enterstudio = self._call_api( enterstudio = self._call_api(
'https://cloudac.mildom.com/nonolive/gappserv/live/enterstudio', video_id, 'https://cloudac.mildom.com/nonolive/gappserv/live/enterstudio', video_id,
note='Downloading live metadata', query={'user_id': video_id}) note='Downloading live metadata', query={'user_id': video_id})
result_video_id = enterstudio.get('log_id', video_id) result_video_id = enterstudio.get('log_id', video_id)
title = try_get(
enterstudio, (
lambda x: self._html_search_meta('twitter:description', webpage),
lambda x: x['anchor_intro'],
), compat_str)
description = try_get(
enterstudio, (
lambda x: x['intro'],
lambda x: x['live_intro'],
), compat_str)
uploader = try_get(
enterstudio, (
lambda x: self._html_search_meta('twitter:title', webpage),
lambda x: x['loginname'],
), compat_str)
servers = self._call_api( servers = self._call_api(
'https://cloudac.mildom.com/nonolive/gappserv/live/liveserver', result_video_id, 'https://cloudac.mildom.com/nonolive/gappserv/live/liveserver', result_video_id,
note='Downloading live server list', query={ note='Downloading live server list', query={
@ -138,17 +61,20 @@ class MildomIE(MildomBaseIE):
'live_server_type': 'hls', 'live_server_type': 'hls',
}) })
stream_query = self._common_queries({ playback_token = self._call_api(
'streamReqId': random_uuidv4(), 'https://cloudac.mildom.com/nonolive/gappserv/live/token', result_video_id,
'is_lhls': '0', note='Obtaining live playback token', body={'host_id': video_id, 'type': 'hls'})
}) playback_token = traverse_obj(playback_token, ('data', ..., 'token'), get_all=False)
m3u8_url = update_url_query(servers['stream_server'] + '/%s_master.m3u8' % video_id, stream_query) if not playback_token:
formats = self._extract_m3u8_formats(m3u8_url, result_video_id, 'mp4', headers={ raise ExtractorError('Failed to obtain live playback token')
formats = self._extract_m3u8_formats(
f'{servers["stream_server"]}/{video_id}_master.m3u8?{playback_token}',
result_video_id, 'mp4', headers={
'Referer': 'https://www.mildom.com/', 'Referer': 'https://www.mildom.com/',
'Origin': 'https://www.mildom.com', 'Origin': 'https://www.mildom.com',
}, note='Downloading m3u8 information') })
del stream_query['streamReqId'], stream_query['timestamp']
for fmt in formats: for fmt in formats:
fmt.setdefault('http_headers', {})['Referer'] = 'https://www.mildom.com/' fmt.setdefault('http_headers', {})['Referer'] = 'https://www.mildom.com/'
@ -156,10 +82,10 @@ class MildomIE(MildomBaseIE):
return { return {
'id': result_video_id, 'id': result_video_id,
'title': title, 'title': self._html_search_meta('twitter:description', webpage, default=None) or traverse_obj(enterstudio, 'anchor_intro'),
'description': description, 'description': traverse_obj(enterstudio, 'intro', 'live_intro', expected_type=str),
'timestamp': float_or_none(enterstudio.get('live_start_ms'), scale=1000), 'timestamp': float_or_none(enterstudio.get('live_start_ms'), scale=1000),
'uploader': uploader, 'uploader': self._html_search_meta('twitter:title', webpage, default=None) or traverse_obj(enterstudio, 'loginname'),
'uploader_id': video_id, 'uploader_id': video_id,
'formats': formats, 'formats': formats,
'is_live': True, 'is_live': True,
@ -168,7 +94,7 @@ class MildomIE(MildomBaseIE):
class MildomVodIE(MildomBaseIE): class MildomVodIE(MildomBaseIE):
IE_NAME = 'mildom:vod' IE_NAME = 'mildom:vod'
IE_DESC = 'Download a VOD in Mildom' IE_DESC = 'VOD in Mildom'
_VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P<user_id>\d+)/(?P<id>(?P=user_id)-[a-zA-Z0-9]+-?[0-9]*)' _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P<user_id>\d+)/(?P<id>(?P=user_id)-[a-zA-Z0-9]+-?[0-9]*)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.mildom.com/playback/10882672/10882672-1597662269', 'url': 'https://www.mildom.com/playback/10882672/10882672-1597662269',
@ -215,11 +141,8 @@ class MildomVodIE(MildomBaseIE):
}] }]
def _real_extract(self, url): def _real_extract(self, url):
m = self._match_valid_url(url) user_id, video_id = self._match_valid_url(url).group('user_id', 'id')
user_id, video_id = m.group('user_id'), m.group('id') webpage = self._download_webpage(f'https://www.mildom.com/playback/{user_id}/{video_id}', video_id)
url = 'https://www.mildom.com/playback/%s/%s' % (user_id, video_id)
webpage = self._download_webpage(url, video_id)
autoplay = self._call_api( autoplay = self._call_api(
'https://cloudac.mildom.com/nonolive/videocontent/playback/getPlaybackDetail', video_id, 'https://cloudac.mildom.com/nonolive/videocontent/playback/getPlaybackDetail', video_id,
@ -227,20 +150,6 @@ class MildomVodIE(MildomBaseIE):
'v_id': video_id, 'v_id': video_id,
})['playback'] })['playback']
title = try_get(
autoplay, (
lambda x: self._html_search_meta('og:description', webpage),
lambda x: x['title'],
), compat_str)
description = try_get(
autoplay, (
lambda x: x['video_intro'],
), compat_str)
uploader = try_get(
autoplay, (
lambda x: x['author_info']['login_name'],
), compat_str)
formats = [{ formats = [{
'url': autoplay['audio_url'], 'url': autoplay['audio_url'],
'format_id': 'audio', 'format_id': 'audio',
@ -265,17 +174,81 @@ class MildomVodIE(MildomBaseIE):
return { return {
'id': video_id, 'id': video_id,
'title': title, 'title': self._html_search_meta(('og:description', 'description'), webpage, default=None) or autoplay.get('title'),
'description': description, 'description': traverse_obj(autoplay, 'video_intro'),
'timestamp': float_or_none(autoplay['publish_time'], scale=1000), 'timestamp': float_or_none(autoplay.get('publish_time'), scale=1000),
'duration': float_or_none(autoplay['video_length'], scale=1000), 'duration': float_or_none(autoplay.get('video_length'), scale=1000),
'thumbnail': dict_get(autoplay, ('upload_pic', 'video_pic')), 'thumbnail': dict_get(autoplay, ('upload_pic', 'video_pic')),
'uploader': uploader, 'uploader': traverse_obj(autoplay, ('author_info', 'login_name')),
'uploader_id': user_id, 'uploader_id': user_id,
'formats': formats, 'formats': formats,
} }
class MildomClipIE(MildomBaseIE):
    """Extractor for single Mildom clip pages (/clip/<user_id>-<clip hash>)."""

    IE_NAME = 'mildom:clip'
    IE_DESC = 'Clip in Mildom'
    _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/clip/(?P<id>(?P<user_id>\d+)-[a-zA-Z0-9]+)'
    _TESTS = [{
        'url': 'https://www.mildom.com/clip/10042245-63921673e7b147ebb0806d42b5ba5ce9',
        'info_dict': {
            'id': '10042245-63921673e7b147ebb0806d42b5ba5ce9',
            'title': '全然違ったよ',
            'timestamp': 1619181890,
            'duration': 59,
            'thumbnail': r're:https?://.+',
            'uploader': 'ざきんぽ',
            'uploader_id': '10042245',
        },
    }, {
        'url': 'https://www.mildom.com/clip/10111524-ebf4036e5aa8411c99fb3a1ae0902864',
        'info_dict': {
            'id': '10111524-ebf4036e5aa8411c99fb3a1ae0902864',
            'title': 'かっこいい',
            'timestamp': 1621094003,
            'duration': 59,
            'thumbnail': r're:https?://.+',
            'uploader': '(ルーキー',
            'uploader_id': '10111524',
        },
    }, {
        'url': 'https://www.mildom.com/clip/10660174-2c539e6e277c4aaeb4b1fbe8d22cb902',
        'info_dict': {
            'id': '10660174-2c539e6e277c4aaeb4b1fbe8d22cb902',
            'title': '',
            'timestamp': 1614769431,
            'duration': 31,
            'thumbnail': r're:https?://.+',
            'uploader': 'ドルゴルスレンギーン=ダグワドルジ',
            'uploader_id': '10660174',
        },
    }]

    def _real_extract(self, url):
        """Build the info dict for one clip from its web page and the clip detail API."""
        mobj = self._match_valid_url(url)
        user_id = mobj.group('user_id')
        video_id = mobj.group('id')

        webpage = self._download_webpage(f'https://www.mildom.com/clip/{video_id}', video_id)

        clip_detail = self._call_api(
            'https://cloudac-cf-jp.mildom.com/nonolive/videocontent/clip/detail', video_id,
            note='Downloading playback metadata', query={'clip_id': video_id})

        # Prefer the page's meta description; fall back to the API title.
        title = self._html_search_meta(
            ('og:description', 'description'), webpage, default=None)
        if not title:
            title = clip_detail.get('title')

        # The media URL is mandatory: a missing key raises KeyError.
        media_url = clip_detail['url']

        info = {
            'id': video_id,
            'title': title,
            'timestamp': float_or_none(clip_detail.get('create_time')),
            'duration': float_or_none(clip_detail.get('length')),
            'thumbnail': clip_detail.get('cover'),
            'uploader': traverse_obj(clip_detail, ('user_info', 'loginname')),
            'uploader_id': user_id,
            'url': media_url,
            'ext': determine_ext(media_url, 'mp4'),
        }
        return info
class MildomUserVodIE(MildomBaseIE): class MildomUserVodIE(MildomBaseIE):
IE_NAME = 'mildom:user:vod' IE_NAME = 'mildom:user:vod'
IE_DESC = 'Download all VODs from specific user in Mildom' IE_DESC = 'Download all VODs from specific user in Mildom'
@ -286,29 +259,32 @@ class MildomUserVodIE(MildomBaseIE):
'id': '10093333', 'id': '10093333',
'title': 'Uploads from ねこばたけ', 'title': 'Uploads from ねこばたけ',
}, },
'playlist_mincount': 351, 'playlist_mincount': 732,
}, { }, {
'url': 'https://www.mildom.com/profile/10882672', 'url': 'https://www.mildom.com/profile/10882672',
'info_dict': { 'info_dict': {
'id': '10882672', 'id': '10882672',
'title': 'Uploads from kson組長(けいそん)', 'title': 'Uploads from kson組長(けいそん)',
}, },
'playlist_mincount': 191, 'playlist_mincount': 201,
}] }]
def _fetch_page(self, user_id, page):
    """Yield a url_result for every playback on one page of a user's VOD list.

    The `page` value received is incremented by one before it is sent to the
    playbackList API. Nothing is yielded when the API reply is empty, and
    items without a 'v_id' are skipped.
    """
    page_number = page + 1
    playbacks = self._call_api(
        'https://cloudac.mildom.com/nonolive/videocontent/profile/playbackList',
        user_id, note=f'Downloading page {page_number}', query={
            'user_id': user_id,
            'page': page_number,
            'limit': '30',
        })

    # An empty/falsy reply simply produces no entries.
    for item in playbacks or ():
        v_id = item.get('v_id')
        if v_id:
            yield self.url_result(f'https://www.mildom.com/playback/{user_id}/{v_id}')
def _real_extract(self, url): def _real_extract(self, url):
user_id = self._match_id(url) user_id = self._match_id(url)
@ -319,4 +295,5 @@ class MildomUserVodIE(MildomBaseIE):
query={'user_id': user_id}, note='Downloading user profile')['user_info'] query={'user_id': user_id}, note='Downloading user profile')['user_info']
return self.playlist_result( return self.playlist_result(
self._entries(user_id), user_id, 'Uploads from %s' % profile['loginname']) OnDemandPagedList(functools.partial(self._fetch_page, user_id), 30),
user_id, f'Uploads from {profile["loginname"]}')