[ie/facebook] Fix webpage extraction (#7890)

Closes #7901
Authored by: ringus1
This commit is contained in:
ringus1 2023-09-05 22:35:23 +02:00 committed by GitHub
parent 99c99c7185
commit d3d81cc98f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -74,6 +74,22 @@ class FacebookIE(InfoExtractor):
_VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary'
_TESTS = [{ _TESTS = [{
'url': 'https://www.facebook.com/radiokicksfm/videos/3676516585958356/',
'info_dict': {
'id': '3676516585958356',
'ext': 'mp4',
'title': 'dr Adam Przygoda',
'description': 'md5:34675bda53336b1d16400265c2bb9b3b',
'uploader': 'RADIO KICKS FM',
'upload_date': '20230818',
'timestamp': 1692346159,
'thumbnail': r're:^https?://.*',
'uploader_id': '100063551323670',
'duration': 3132.184,
'view_count': int,
'concurrent_view_count': 0,
},
}, {
'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',
'md5': '6a40d33c0eccbb1af76cf0485a052659', 'md5': '6a40d33c0eccbb1af76cf0485a052659',
'info_dict': { 'info_dict': {
@ -97,7 +113,7 @@ class FacebookIE(InfoExtractor):
'upload_date': '20140506', 'upload_date': '20140506',
'timestamp': 1399398998, 'timestamp': 1399398998,
'thumbnail': r're:^https?://.*', 'thumbnail': r're:^https?://.*',
'uploader_id': 'pfbid04scW44U4P9iTyLZAGy8y8W3pR3i2VugvHCimiRudUAVbN3MPp9eXBaYFcgVworZwl', 'uploader_id': 'pfbid028wxorhX2ErLFJ578N6P3crHD3PHmXTCqCvfBpsnbSLmbokwSY75p5hWBjHGkG4zxl',
'duration': 131.03, 'duration': 131.03,
'concurrent_view_count': int, 'concurrent_view_count': int,
}, },
@ -179,7 +195,7 @@ class FacebookIE(InfoExtractor):
'timestamp': 1486648217, 'timestamp': 1486648217,
'upload_date': '20170209', 'upload_date': '20170209',
'uploader': 'Yaroslav Korpan', 'uploader': 'Yaroslav Korpan',
'uploader_id': 'pfbid029y8j22EwH3ikeqgH3SEP9G3CAi9kmWKgXJJG9s5geV7mo3J2bvURqHCdgucRgAyhl', 'uploader_id': 'pfbid06AScABAWcW91qpiuGrLt99Ef9tvwHoXP6t8KeFYEqkSfreMtfa9nTveh8b2ZEVSWl',
'concurrent_view_count': int, 'concurrent_view_count': int,
'thumbnail': r're:^https?://.*', 'thumbnail': r're:^https?://.*',
'view_count': int, 'view_count': int,
@ -274,7 +290,7 @@ class FacebookIE(InfoExtractor):
'title': 'Josef', 'title': 'Josef',
'thumbnail': r're:^https?://.*', 'thumbnail': r're:^https?://.*',
'concurrent_view_count': int, 'concurrent_view_count': int,
'uploader_id': 'pfbid02gXHbDwxumkaKJQaTGUf3znYfYzTuidGEWawiramNx4YamSj2afwYSRkpcjtHtMRJl', 'uploader_id': 'pfbid0cibUN6tV7DYgdbJdsUFN46wc4jKpVSPAvJQhFofGqBGmVn3V3JtAs2tfUwziw2hUl',
'timestamp': 1549275572, 'timestamp': 1549275572,
'duration': 3.413, 'duration': 3.413,
'uploader': 'Josef Novak', 'uploader': 'Josef Novak',
@ -401,9 +417,9 @@ class FacebookIE(InfoExtractor):
def extract_metadata(webpage): def extract_metadata(webpage):
post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall( post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall(
r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)] r'data-sjs>({.*?ScheduledServerJS.*?})</script>', webpage)]
post = traverse_obj(post_data, ( post = traverse_obj(post_data, (
..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: ( media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict) k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
title = get_first(media, ('title', 'text')) title = get_first(media, ('title', 'text'))
@ -493,14 +509,14 @@ class FacebookIE(InfoExtractor):
def extract_relay_data(_filter): def extract_relay_data(_filter):
return self._parse_json(self._search_regex( return self._parse_json(self._search_regex(
r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter, r'data-sjs>({.*?%s.*?})</script>' % _filter,
webpage, 'replay data', default='{}'), video_id, fatal=False) or {} webpage, 'replay data', default='{}'), video_id, fatal=False) or {}
def extract_relay_prefetched_data(_filter): def extract_relay_prefetched_data(_filter):
replay_data = extract_relay_data(_filter) return traverse_obj(extract_relay_data(_filter), (
for require in (replay_data.get('require') or []): 'require', (None, (..., ..., ..., '__bbox', 'require')),
if require[0] == 'RelayPrefetchedStreamCache': lambda _, v: 'RelayPrefetchedStreamCache' in v, ..., ...,
return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} '__bbox', 'result', 'data', {dict}), get_all=False) or {}
if not video_data: if not video_data:
server_js_data = self._parse_json(self._search_regex([ server_js_data = self._parse_json(self._search_regex([
@ -511,7 +527,7 @@ class FacebookIE(InfoExtractor):
if not video_data: if not video_data:
data = extract_relay_prefetched_data( data = extract_relay_prefetched_data(
r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"') r'"(?:dash_manifest|playable_url(?:_quality_hd)?)')
if data: if data:
entries = [] entries = []
@ -526,7 +542,8 @@ class FacebookIE(InfoExtractor):
formats = [] formats = []
q = qualities(['sd', 'hd']) q = qualities(['sd', 'hd'])
for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
('playable_url_dash', '')): ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
('browser_native_sd_url', 'sd')):
playable_url = video.get(key) playable_url = video.get(key)
if not playable_url: if not playable_url:
continue continue