[extractor] Simplify search extractors

parent a903d8285c
commit cc16383ff3

6 changed files with 27 additions and 72 deletions
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import base64
 import datetime
 import hashlib
+import itertools
 import json
 import netrc
 import os
@@ -3617,7 +3618,14 @@ class SearchInfoExtractor(InfoExtractor):
         return self._get_n_results(query, n)
 
     def _get_n_results(self, query, n):
-        """Get a specified number of results for a query"""
+        """Get a specified number of results for a query.
+        Either this function or _search_results must be overridden by subclasses """
+        return self.playlist_result(
+            itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
+            query, query)
+
+    def _search_results(self, query):
+        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')
 
     @property
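The hunk above is the core of the change: SearchInfoExtractor now implements _get_n_results once, slicing whatever iterator a subclass returns from _search_results, so the per-site extractors changed below only have to yield results. As a rough sketch of what a subclass looks like under this API (the extractor name, search key, endpoint and JSON fields here are made up purely for illustration and do not correspond to any extractor touched by this commit):

import itertools

# Assumes SearchInfoExtractor is importable from the extractor module patched above
class ExampleSearchIE(SearchInfoExtractor):
    # Hypothetical extractor, only to illustrate the new _search_results contract
    IE_NAME = 'example:search'
    _SEARCH_KEY = 'exsearch'
    _MAX_RESULTS = float('inf')

    def _search_results(self, query):
        # Yield url_result entries page by page; truncation to the requested
        # number of results is handled by the base class via itertools.islice
        for page in itertools.count(1):
            data = self._download_json(
                'https://example.invalid/api/search', query,
                note=f'Downloading page {page}',
                query={'q': query, 'page': page})
            for item in data.get('results') or []:
                yield self.url_result(item['url'])
            if not data.get('next_page'):
                return

A request such as exsearch10:some query then goes through the shared _get_n_results, which is why the per-extractor counting code (total, collected_results, manual islice calls) disappears in the hunks below.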
@@ -11,6 +11,7 @@ class GoogleSearchIE(SearchInfoExtractor):
     _MAX_RESULTS = 1000
     IE_NAME = 'video.google:search'
     _SEARCH_KEY = 'gvsearch'
+    _WORKING = False
     _TEST = {
         'url': 'gvsearch15:python language',
         'info_dict': {
@@ -20,16 +21,7 @@ class GoogleSearchIE(SearchInfoExtractor):
         'playlist_count': 15,
     }
 
-    def _get_n_results(self, query, n):
-        """Get a specified number of results for a query"""
-
-        entries = []
-        res = {
-            '_type': 'playlist',
-            'id': query,
-            'title': query,
-        }
-
+    def _search_results(self, query):
         for pagenum in itertools.count():
             webpage = self._download_webpage(
                 'http://www.google.com/search',
@@ -44,16 +36,8 @@ class GoogleSearchIE(SearchInfoExtractor):
 
             for hit_idx, mobj in enumerate(re.finditer(
                     r'<h3 class="r"><a href="([^"]+)"', webpage)):
-                # Skip playlists
-                if not re.search(r'id="vidthumb%d"' % (hit_idx + 1), webpage):
-                    continue
-
-                entries.append({
-                    '_type': 'url',
-                    'url': mobj.group(1)
-                })
-
-            if (len(entries) >= n) or not re.search(r'id="pnnext"', webpage):
-                res['entries'] = entries[:n]
-                return res
+                if re.search(f'id="vidthumb{hit_idx + 1}"', webpage):
+                    yield self.url_result(mobj.group(1))
+
+            if not re.search(r'id="pnnext"', webpage):
+                return
@@ -709,11 +709,9 @@ class NicovideoSearchIE(SearchInfoExtractor, NicovideoSearchURLIE):
     _SEARCH_KEY = 'nicosearch'
     _TESTS = []
 
-    def _get_n_results(self, query, n):
-        entries = self._entries(self._proto_relative_url(f'//www.nicovideo.jp/search/{query}'), query)
-        if n < float('inf'):
-            entries = itertools.islice(entries, 0, n)
-        return self.playlist_result(entries, query, query)
+    def _search_results(self, query):
+        return self._entries(
+            self._proto_relative_url(f'//www.nicovideo.jp/search/{query}'), query)
 
 
 class NicovideoSearchDateIE(NicovideoSearchIE):
@@ -880,30 +880,19 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
         })
         next_url = update_url_query(self._API_V2_BASE + endpoint, query)
 
-        collected_results = 0
-
         for i in itertools.count(1):
             response = self._download_json(
-                next_url, collection_id, 'Downloading page {0}'.format(i),
+                next_url, collection_id, f'Downloading page {i}',
                 'Unable to download API page', headers=self._HEADERS)
 
-            collection = response.get('collection', [])
-            if not collection:
-                break
-
-            collection = list(filter(bool, collection))
-            collected_results += len(collection)
-
-            for item in collection:
-                yield self.url_result(item['uri'], SoundcloudIE.ie_key())
-
-            if not collection or collected_results >= limit:
-                break
+            for item in response.get('collection') or []:
+                if item:
+                    yield self.url_result(item['uri'], SoundcloudIE.ie_key())
 
             next_url = response.get('next_href')
             if not next_url:
                 break
 
     def _get_n_results(self, query, n):
         tracks = self._get_collection('search/tracks', query, limit=n, q=query)
-        return self.playlist_result(tracks, playlist_title=query)
+        return self.playlist_result(tracks, query, query)
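A side note on the SoundCloud hunk above: response.get('collection', []) only uses the default when the key is missing entirely, while response.get('collection') or [] also tolerates an explicit null in the API reply, and the per-item if item: check takes over the job of the old list(filter(bool, collection)). A tiny standalone illustration of the .get() difference:

response = {'collection': None}  # hypothetical API payload with an explicit null

print(response.get('collection', []))    # None -> iterating over this would raise TypeError
print(response.get('collection') or [])  # []   -> always safe to iterate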
@@ -334,30 +334,14 @@ class YahooSearchIE(SearchInfoExtractor):
     IE_NAME = 'screen.yahoo:search'
     _SEARCH_KEY = 'yvsearch'
 
-    def _get_n_results(self, query, n):
-        """Get a specified number of results for a query"""
-        entries = []
+    def _search_results(self, query):
         for pagenum in itertools.count(0):
             result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
             info = self._download_json(result_url, query,
                                        note='Downloading results page ' + str(pagenum + 1))
-            m = info['m']
-            results = info['results']
-
-            for (i, r) in enumerate(results):
-                if (pagenum * 30) + i >= n:
-                    break
-                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
-                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
-                entries.append(e)
-            if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)):
-                break
-
-        return {
-            '_type': 'playlist',
-            'id': query,
-            'entries': entries,
-        }
+            yield from (self.url_result(result['rurl']) for result in info['results'])
+            if info['m']['last'] >= info['m']['total'] - 1:
+                break
 
 
 class YahooGyaOPlayerIE(InfoExtractor):
@@ -4615,11 +4615,10 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE):
     _SEARCH_PARAMS = None
     _TESTS = []
 
-    def _entries(self, query, n):
+    def _search_results(self, query):
         data = {'query': query}
         if self._SEARCH_PARAMS:
             data['params'] = self._SEARCH_PARAMS
-        total = 0
         continuation = {}
         for page_num in itertools.count(1):
             data.update(continuation)
@@ -4662,17 +4661,10 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE):
                    continue
 
                yield self._extract_video(video)
-                total += 1
-                if total == n:
-                    return
 
            if not continuation:
                break
 
-    def _get_n_results(self, query, n):
-        """Get a specified number of results for a query"""
-        return self.playlist_result(self._entries(query, n), query, query)
-
 
 class YoutubeSearchDateIE(YoutubeSearchIE):
     IE_NAME = YoutubeSearchIE.IE_NAME + ':date'