Skip to content

Commit f51b71f

Browse files
author
Ronald Schmidt
committed
Fixes NikolaiT#149 plus some styles
1 parent 77a7aa1 commit f51b71f

4 files changed

Lines changed: 48 additions & 54 deletions

File tree

GoogleScraper/core.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
class WrongConfigurationError(Exception):
2828
pass
2929

30+
3031
def id_for_keywords(keywords):
3132
"""Determine a unique id for the keywords.
3233
@@ -97,7 +98,8 @@ def start_python_console(namespace=None, noipython=False, banner=''):
9798
except ImportError:
9899
pass
99100
else:
100-
import rlcompleter
101+
pass
102+
# import rlcompleter
101103

102104
readline.parse_and_bind("tab:complete")
103105
code.interact(banner=banner, local=namespace)
@@ -202,7 +204,7 @@ def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
202204
proxy_db = config.get('mysql_proxy_db', '')
203205

204206
# when no search engine is specified, use google
205-
search_engines = config.get('search_engines', ['google',])
207+
search_engines = config.get('search_engines', ['google'])
206208
if not isinstance(search_engines, list):
207209
if search_engines == '*':
208210
search_engines = config.get('supported_search_engines')
@@ -238,8 +240,7 @@ def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
238240
if not (keyword or keywords) and not kwfile:
239241
# Just print the help.
240242
get_command_line(True)
241-
print('No keywords to scrape for. Please provide either an keyword file (Option: --keyword-file) or specify and '
242-
'keyword with --keyword.')
243+
print('No keywords to scrape for. Please provide either an keyword file (Option: --keyword-file) or specify and keyword with --keyword.')
243244
return
244245

245246
cache_manager = CacheManager(config)
@@ -456,4 +457,4 @@ def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
456457
session.commit()
457458

458459
if return_results:
459-
return scraper_search
460+
return session

GoogleScraper/parsing.py

Lines changed: 23 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,12 @@ class Parser():
3030
"""Parses SERP pages.
3131
3232
Each search engine results page (SERP) has a similar layout:
33-
33+
3434
The main search results are usually in a html container element (#main, .results, #leftSide).
35-
There might be separate columns for other search results (like ads for example). Then each
35+
There might be separate columns for other search results (like ads for example). Then each
3636
result contains basically a link, a snippet and a description (usually some text on the
3737
target site). It's really astonishing how similar other search engines are to Google.
38-
38+
3939
Each child class (that can actual parse a concrete search engine results page) needs
4040
to specify css selectors for the different search types (Like normal search, news search, video search, ...).
4141
@@ -73,10 +73,10 @@ def __init__(self, config={}, html='', query=''):
7373
"""Create new Parser instance and parse all information.
7474
7575
Args:
76-
html: The raw html from the search engine search. If not provided, you can parse
76+
html: The raw html from the search engine search. If not provided, you can parse
7777
the data later by calling parse(html) directly.
7878
searchtype: The search type. By default "normal"
79-
79+
8080
Raises:
8181
Assertion error if the subclassed
8282
specific parser cannot handle the the settings.
@@ -109,8 +109,8 @@ def __init__(self, config={}, html='', query=''):
109109

110110
def parse(self, html=None):
111111
"""Public function to start parsing the search engine results.
112-
113-
Args:
112+
113+
Args:
114114
html: The raw html data to extract the SERP entries from.
115115
"""
116116
if html:
@@ -137,7 +137,7 @@ def _parse_lxml(self, cleaner=None):
137137

138138
def _parse(self, cleaner=None):
139139
"""Internal parse the dom according to the provided css selectors.
140-
140+
141141
Raises: InvalidSearchTypeException if no css selectors for the searchtype could be found.
142142
"""
143143
self.num_results = 0
@@ -152,8 +152,7 @@ def _parse(self, cleaner=None):
152152

153153
self.num_results_for_query = self.first_match(num_results_selector, self.dom)
154154
if not self.num_results_for_query:
155-
logger.debug('{}: Cannot parse num_results from serp page with selectors {}'.format(self.__class__.__name__,
156-
num_results_selector))
155+
logger.debug('{}: Cannot parse num_results from serp page with selectors {}'.format(self.__class__.__name__, num_results_selector))
157156

158157
# get the current page we are at. Sometimes we search engines don't show this.
159158
try:
@@ -180,7 +179,7 @@ def _parse(self, cleaner=None):
180179

181180
self.search_results[result_type] = []
182181

183-
for selector_specific, selectors in selector_class.items():
182+
for _, selectors in selector_class.items():
184183

185184
if 'result_container' in selectors and selectors['result_container']:
186185
css = '{container} {result_container}'.format(**selectors)
@@ -272,14 +271,14 @@ def first_match(self, selectors, element):
272271
match = self.advanced_css(selector, element=element)
273272
if match:
274273
return match
275-
except IndexError as e:
274+
except IndexError:
276275
pass
277276

278277
return False
279278

280279
def after_parsing(self):
281280
"""Subclass specific behaviour after parsing happened.
282-
281+
283282
Override in subclass to add search engine specific behaviour.
284283
Commonly used to clean the results.
285284
"""
@@ -312,7 +311,7 @@ def iter_serp_items(self):
312311

313312

314313
"""
315-
Here follow the different classes that provide CSS selectors
314+
Here follow the different classes that provide CSS selectors
316315
for different types of SERP pages of several common search engines.
317316
318317
Just look at them and add your own selectors in a new class if you
@@ -404,7 +403,7 @@ class GoogleParser(Parser):
404403
image_search_selectors = {
405404
'results': {
406405
'de_ip': {
407-
'container': 'li#isr_mc',
406+
'container': '#isr_mc',
408407
'result_container': 'div.rg_di',
409408
'link': 'a.rg_l::attr(href)'
410409
},
@@ -422,12 +421,12 @@ def __init__(self, *args, **kwargs):
422421

423422
def after_parsing(self):
424423
"""Clean the urls.
425-
424+
426425
A typical scraped results looks like the following:
427-
426+
428427
'/url?q=http://www.youtube.com/user/Apple&sa=U&ei=\
429428
lntiVN7JDsTfPZCMgKAO&ved=0CFQQFjAO&usg=AFQjCNGkX65O-hKLmyq1FX9HQqbb9iYn9A'
430-
429+
431430
Clean with a short regex.
432431
"""
433432
super().after_parsing()
@@ -543,11 +542,10 @@ def after_parsing(self):
543542
try:
544543
i = self.html.index(substr)
545544
if i:
546-
self.num_results_for_query = re.search(r'— (.)*?"', self.html[i:i+len(self.query) + 150]).group()
545+
self.num_results_for_query = re.search(r'— (.)*?"', self.html[i:i + len(self.query) + 150]).group()
547546
except Exception as e:
548547
logger.debug(str(e))
549548

550-
551549
if self.searchtype == 'image':
552550
for key, i in self.iter_serp_items():
553551
for regex in (
@@ -626,7 +624,7 @@ class BingParser(Parser):
626624
'ch_ip': {
627625
'container': '#dg_c .imgres',
628626
'result_container': '.dg_u',
629-
'link': 'a.dv_i::attr(m)'
627+
'link': 'a::attr(m)'
630628
},
631629
}
632630
}
@@ -1049,12 +1047,12 @@ def parse_serp(config, html=None, parser=None, scraper=None, search_engine=None,
10491047

10501048
if __name__ == '__main__':
10511049
"""Originally part of https://github.com/NikolaiT/GoogleScraper.
1052-
1053-
Only for testing purposes: May be called directly with an search engine
1050+
1051+
Only for testing purposes: May be called directly with an search engine
10541052
search url. For example:
1055-
1053+
10561054
python3 parsing.py 'http://yandex.ru/yandsearch?text=GoogleScraper&lr=178&csg=82%2C4317%2C20%2C20%2C0%2C0%2C0'
1057-
1055+
10581056
Please note: Using this module directly makes little sense, because requesting such urls
10591057
directly without imitating a real browser (which is done in my GoogleScraper module) makes
10601058
the search engines return crippled html, which makes it impossible to parse.

GoogleScraper/search_engine_parameters.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373
# current geographic location.
7474
'safe': 'off', # Turns the adult content filter on or off
7575
'rls': None,
76-
#Source of query with version of the client and language set. With firefox set to 'org.mozilla:en-US:official'
76+
# Source of query with version of the client and language set. With firefox set to 'org.mozilla:en-US:official'
7777
'sa': None,
7878
# User search behavior parameter sa=N: User searched, sa=X: User clicked on related searches in the SERP
7979
'source': None, # Google navigational parameter specifying where you came from, univ: universal search
@@ -117,8 +117,8 @@
117117
'oe': 'UTF-8', # Sets the character encoding that is used to encode the results.
118118
'ip': None,
119119
# When queries are made using the HTTP protocol, the ip parameter contains the IP address of the user
120-
#who submitted the search query. You do not supply this parameter with the search request. The ip
121-
#parameter is returned in the XML search results. For example:
120+
# who submitted the search query. You do not supply this parameter with the search request. The ip
121+
# parameter is returned in the XML search results. For example:
122122
'sitesearch': None,
123123
# Limits search results to documents in the specified domain, host, or web directory. Has no effect if the q
124124
# parameter is empty. This parameter has the same effect as the site special query term.
@@ -147,19 +147,19 @@
147147
# ft are: 'i': filetype and 'e': -filetype
148148
'as_lq': None,
149149
# Specifies a URL, and causes search results to show pages that link to the that URL. This parameter has
150-
#the same effect as the link special query term (see “Back Links” on page 20). No other query terms can
151-
#be used when using this parameter.
150+
# the same effect as the link special query term (see “Back Links” on page 20). No other query terms can
151+
# be used when using this parameter.
152152
'as_occt': None,
153153
# Specifies where the search engine is to look for the query terms on the page: anywhere on the page, in
154-
#the title, or in the URL.
154+
# the title, or in the URL.
155155
'as_oq': None,
156156
# Combines the specified terms to the search query in parameter q, with an OR operation. This parameter
157157
# has the same effect as the OR special query term (see “Boolean OR Search” on page 20).
158158
'as_q': None, # Adds the specified query terms to the query terms in parameter q.
159159
'as_sitesearch': None,
160160
# Limits search results to documents in the specified domain, host or web directory, or excludes results
161-
#from the specified location, depending on the value of as_dt. This parameter has the same effect as the
162-
#site or -site special query terms. It has no effect if the q parameter is empty.
161+
# from the specified location, depending on the value of as_dt. This parameter has the same effect as the
162+
# site or -site special query terms. It has no effect if the q parameter is empty.
163163
'entqr': None, # This parameter sets the query expansion policy according to the following valid values:
164164
# 0: None
165165
# 1: Standard Uses only the search appliance’s synonym file.
@@ -182,7 +182,7 @@
182182
183183
"""
184184
bing_search_params = {
185-
185+
'adlt': 'off'
186186
}
187187

188188
"""

GoogleScraper/selenium_mode.py

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ class SelScrape(SearchEngineScrape, threading.Thread):
5858
'google': '#pnnext',
5959
'yandex': '.pager__button_kind_next',
6060
'bing': '.sb_pagN',
61-
'yahoo': '#pg-next',
61+
'yahoo': '.compPagination .next',
6262
'baidu': '.n',
6363
'ask': '#paging div a.txt3.l_nu',
6464
'blekko': '',
@@ -301,7 +301,7 @@ def handle_request_denied(self, status_code):
301301

302302
if self.config.get('manual_captcha_solving', False):
303303
with self.captcha_lock:
304-
import tempfile
304+
# import tempfile
305305

306306
tf = tempfile.NamedTemporaryFile('wb')
307307
tf.write(self.webdriver.get_screenshot_as_png())
@@ -450,15 +450,18 @@ def _find_next_page_element(self):
450450
try:
451451
# wait until the next page link is clickable
452452
WebDriverWait(self.webdriver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
453-
except (WebDriverException, TimeoutException) as e:
453+
except (WebDriverException, TimeoutException):
454454
self._save_debug_screenshot()
455-
raise Exception('{}: Cannot locate next page element: {}'.format(self.name, str(e)))
455+
# raise Exception('{}: Cannot locate next page element: {}'.format(self.name, str(e)))
456456

457457
return self.webdriver.find_element_by_css_selector(selector)
458458

459459
elif self.search_type == 'image':
460460
self.page_down()
461-
return True
461+
if self.search_engine_name == 'google':
462+
return self.webdriver.find_element_by_css_selector('input._kvc')
463+
else:
464+
return True
462465

463466
def wait_until_serp_loaded(self):
464467
"""
@@ -595,17 +598,9 @@ def page_down(self):
595598
Used for next page in image search mode or when the
596599
next results are obtained by scrolling down a page.
597600
"""
598-
js = '''
599-
var w = window,
600-
d = document,
601-
e = d.documentElement,
602-
g = d.getElementsByTagName('body')[0],
603-
y = w.innerHeight|| e.clientHeight|| g.clientHeight;
604-
605-
window.scrollBy(0,y);
606-
return y;
607-
'''
601+
js = 'window.scrollTo(0,document.body.scrollHeight);'
608602

603+
time.sleep(5)
609604
self.webdriver.execute_script(js)
610605

611606
def run(self):

0 commit comments

Comments (0)