[extractor/substack] Add extractor (#4011)

Closes #3722
Authored by: elyse0
This commit is contained in:
Elyse 2022-06-18 19:08:53 -05:00 committed by GitHub
parent 7a2e40dd48
commit 612e31f5ea
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 135 additions and 1 deletions

View file

@ -1640,6 +1640,7 @@ from .streetvoice import StreetVoiceIE
from .stretchinternet import StretchInternetIE from .stretchinternet import StretchInternetIE
from .stripchat import StripchatIE from .stripchat import StripchatIE
from .stv import STVPlayerIE from .stv import STVPlayerIE
from .substack import SubstackIE
from .sunporno import SunPornoIE from .sunporno import SunPornoIE
from .sverigesradio import ( from .sverigesradio import (
SverigesRadioEpisodeIE, SverigesRadioEpisodeIE,

View file

@ -69,6 +69,7 @@ from .spankwire import SpankwireIE
from .sportbox import SportBoxIE from .sportbox import SportBoxIE
from .spotify import SpotifyBaseIE from .spotify import SpotifyBaseIE
from .springboardplatform import SpringboardPlatformIE from .springboardplatform import SpringboardPlatformIE
from .substack import SubstackIE
from .svt import SVTIE from .svt import SVTIE
from .teachable import TeachableIE from .teachable import TeachableIE
from .ted import TedEmbedIE from .ted import TedEmbedIE
@ -2542,7 +2543,34 @@ class GenericIE(InfoExtractor):
'timestamp': 1652833414, 'timestamp': 1652833414,
'age_limit': 0, 'age_limit': 0,
} }
}, { },
{
'url': 'https://www.mollymovieclub.com/p/interstellar?s=r#details',
'md5': '198bde8bed23d0b23c70725c83c9b6d9',
'info_dict': {
'id': '53602801',
'ext': 'mpga',
'title': 'Interstellar',
'description': 'Listen now | Episode One',
'thumbnail': 'md5:c30d9c83f738e16d8551d7219d321538',
'uploader': 'Molly Movie Club',
'uploader_id': '839621',
},
},
{
'url': 'https://www.blockedandreported.org/p/episode-117-lets-talk-about-depp?s=r',
'md5': 'c0cc44ee7415daeed13c26e5b56d6aa0',
'info_dict': {
'id': '57962052',
'ext': 'mpga',
'title': 'md5:855b2756f0ee10f6723fa00b16266f8d',
'description': 'md5:fe512a5e94136ad260c80bde00ea4eef',
'thumbnail': 'md5:2218f27dfe517bb5ac16c47d0aebac59',
'uploader': 'Blocked and Reported',
'uploader_id': '500230',
},
},
{
'url': 'https://www.skimag.com/video/ski-people-1980/', 'url': 'https://www.skimag.com/video/ski-people-1980/',
'info_dict': { 'info_dict': {
'id': 'ski-people-1980', 'id': 'ski-people-1980',
@ -3107,6 +3135,11 @@ class GenericIE(InfoExtractor):
# Don't set the extractor because it can be a track url or an album # Don't set the extractor because it can be a track url or an album
return self.url_result(burl) return self.url_result(burl)
# Check for Substack custom domains
substack_url = SubstackIE._extract_url(webpage, url)
if substack_url:
return self.url_result(substack_url, SubstackIE)
# Look for embedded Vevo player # Look for embedded Vevo player
mobj = re.search( mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage) r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)

View file

@ -0,0 +1,100 @@
import re
import urllib.parse
from .common import InfoExtractor
from ..utils import str_or_none, traverse_obj
class SubstackIE(InfoExtractor):
    """Extractor for video and podcast posts published on Substack.

    Matches ``<username>.substack.com/p/<slug>`` URLs directly. Pages served
    from a custom domain can be mapped to that form with ``_extract_url``.
    """

    _VALID_URL = r'https?://(?P<username>[\w-]+)\.substack\.com/p/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://haleynahman.substack.com/p/i-made-a-vlog?s=r',
        'md5': 'f27e4fc6252001d48d479f45e65cdfd5',
        'info_dict': {
            'id': '47660949',
            'ext': 'mp4',
            'title': 'I MADE A VLOG',
            'description': 'md5:10c01ff93439a62e70ce963b2aa0b7f6',
            'thumbnail': 'md5:bec758a34d8ee9142d43bcebdf33af18',
            'uploader': 'Maybe Baby',
            'uploader_id': '33628',
        }
    }, {
        'url': 'https://haleynahman.substack.com/p/-dear-danny-i-found-my-boyfriends?s=r',
        'md5': '0a63eacec877a1171a62cfa69710fcea',
        'info_dict': {
            'id': '51045592',
            'ext': 'mpga',
            'title': "🎧 Dear Danny: I found my boyfriend's secret Twitter account",
            'description': 'md5:a57f2439319e56e0af92dd0c95d75797',
            'thumbnail': 'md5:daa40b6b79249417c14ff8103db29639',
            'uploader': 'Maybe Baby',
            'uploader_id': '33628',
        }
    }, {
        'url': 'https://andrewzimmern.substack.com/p/mussels-with-black-bean-sauce-recipe',
        'md5': 'fd3c07077b02444ff0130715b5f632bb',
        'info_dict': {
            'id': '47368578',
            'ext': 'mp4',
            'title': 'Mussels with Black Bean Sauce: Recipe of the Week #7',
            'description': 'md5:b96234a2906c7d854d5229818d889515',
            'thumbnail': 'md5:e30bfaa9da40e82aa62354263a9dd232',
            'uploader': "Andrew Zimmern's Spilled Milk ",
            'uploader_id': '577659',
        }
    }]

    @classmethod
    def _extract_url(cls, webpage, url):
        """Rewrite the URL of a Substack page on a custom domain.

        Returns ``url`` with its host replaced by ``<subdomain>.substack.com``,
        or ``None`` when ``webpage`` does not load a script from the Substack
        CDN or no ``subdomain`` value can be found in it.
        """
        if not re.search(r'<script[^>]+src=["\']https://substackcdn\.com/[^"\']+\.js', webpage):
            return None

        # The value may be wrapped in either quote style, so the capture has to
        # stop at whichever quote character closes it.
        mobj = re.search(r'{[^}]*["\']subdomain["\']\s*:\s*["\'](?P<subdomain>[^"\']+)', webpage)
        if not mobj:
            return None

        # Only the host changes; path, query and fragment are carried over.
        parsed = urllib.parse.urlparse(url)
        return parsed._replace(netloc=f'{mobj.group("subdomain")}.substack.com').geturl()

    def _extract_video_formats(self, video_id, username):
        """Return ``(formats, subtitles)`` for an uploaded video.

        The ``hls`` source is expanded through its manifest (non-fatally), and
        the ``mp4`` source is added as a single direct format entry.
        """
        formats, subtitles = [], {}
        for video_format in ('hls', 'mp4'):
            video_url = f'https://{username}.substack.com/api/v1/video/upload/{video_id}/src?type={video_format}'

            if video_format == 'hls':
                fmts, subs = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4', fatal=False)
                formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)
            else:
                formats.append({
                    'url': video_url,
                    'ext': video_format,
                })

        return formats, subtitles

    def _real_extract(self, url):
        """Build the info dict for a post from the page's ``window._preloads`` JSON."""
        display_id, username = self._match_valid_url(url).group('id', 'username')
        webpage = self._download_webpage(url, display_id)

        webpage_info = self._search_json(r'<script[^>]*>\s*window\._preloads\s*=', webpage, 'preloads', display_id)
        post = webpage_info['post']

        post_type = post['type']
        formats, subtitles = [], {}
        if post_type == 'podcast':
            # Podcast posts expose a single direct audio URL
            formats = [{'url': post['podcast_url']}]
        elif post_type == 'video':
            formats, subtitles = self._extract_video_formats(post['videoUpload']['id'], username)
        else:
            self.raise_no_formats(f'Page type "{post_type}" is not supported')

        self._sort_formats(formats)
        return {
            'id': str(post['id']),
            'formats': formats,
            'subtitles': subtitles,
            'title': traverse_obj(webpage_info, ('post', 'title')),
            'description': traverse_obj(webpage_info, ('post', 'description')),
            'thumbnail': traverse_obj(webpage_info, ('post', 'cover_image')),
            'uploader': traverse_obj(webpage_info, ('pub', 'name')),
            'uploader_id': str_or_none(traverse_obj(webpage_info, ('post', 'publication_id'))),
        }