@@ -30,12 +30,12 @@ class Parser():
3030 """Parses SERP pages.
3131
3232 Each search engine results page (SERP) has a similar layout:
33-
33+
3434 The main search results are usually in a html container element (#main, .results, #leftSide).
35- There might be separate columns for other search results (like ads for example). Then each
35+ There might be separate columns for other search results (like ads for example). Then each
3636 result contains basically a link, a snippet and a description (usually some text on the
3737 target site). It's really astonishing how similar other search engines are to Google.
38-
38+
3939 Each child class (that can actually parse a concrete search engine results page) needs
4040 to specify css selectors for the different search types (Like normal search, news search, video search, ...).
4141
@@ -73,10 +73,10 @@ def __init__(self, config={}, html='', query=''):
7373 """Create new Parser instance and parse all information.
7474
7575 Args:
76- html: The raw html from the search engine search. If not provided, you can parse
76+ html: The raw html from the search engine search. If not provided, you can parse
7777 the data later by calling parse(html) directly.
7878 searchtype: The search type. By default "normal"
79-
79+
8080 Raises:
8181 Assertion error if the subclassed
8282 specific parser cannot handle the settings.
@@ -109,8 +109,8 @@ def __init__(self, config={}, html='', query=''):
109109
110110 def parse (self , html = None ):
111111 """Public function to start parsing the search engine results.
112-
113- Args:
112+
113+ Args:
114114 html: The raw html data to extract the SERP entries from.
115115 """
116116 if html :
@@ -137,7 +137,7 @@ def _parse_lxml(self, cleaner=None):
137137
138138 def _parse (self , cleaner = None ):
139139 """Internally parse the dom according to the provided css selectors.
140-
140+
141141 Raises: InvalidSearchTypeException if no css selectors for the searchtype could be found.
142142 """
143143 self .num_results = 0
@@ -152,8 +152,7 @@ def _parse(self, cleaner=None):
152152
153153 self .num_results_for_query = self .first_match (num_results_selector , self .dom )
154154 if not self .num_results_for_query :
155- logger .debug ('{}: Cannot parse num_results from serp page with selectors {}' .format (self .__class__ .__name__ ,
156- num_results_selector ))
155+ logger .debug ('{}: Cannot parse num_results from serp page with selectors {}' .format (self .__class__ .__name__ , num_results_selector ))
157156
158157 # get the current page we are at. Sometimes search engines don't show this.
159158 try :
@@ -180,7 +179,7 @@ def _parse(self, cleaner=None):
180179
181180 self .search_results [result_type ] = []
182181
183- for selector_specific , selectors in selector_class .items ():
182+ for _ , selectors in selector_class .items ():
184183
185184 if 'result_container' in selectors and selectors ['result_container' ]:
186185 css = '{container} {result_container}' .format (** selectors )
@@ -272,14 +271,14 @@ def first_match(self, selectors, element):
272271 match = self .advanced_css (selector , element = element )
273272 if match :
274273 return match
275- except IndexError as e :
274+ except IndexError :
276275 pass
277276
278277 return False
279278
280279 def after_parsing (self ):
281280 """Subclass specific behaviour after parsing happened.
282-
281+
283282 Override in subclass to add search engine specific behaviour.
284283 Commonly used to clean the results.
285284 """
@@ -312,7 +311,7 @@ def iter_serp_items(self):
312311
313312
314313"""
315- Here follow the different classes that provide CSS selectors
314+ Here follow the different classes that provide CSS selectors
316315for different types of SERP pages of several common search engines.
317316
318317Just look at them and add your own selectors in a new class if you
@@ -404,7 +403,7 @@ class GoogleParser(Parser):
404403 image_search_selectors = {
405404 'results' : {
406405 'de_ip' : {
407- 'container' : 'li #isr_mc' ,
406+ 'container' : '#isr_mc' ,
408407 'result_container' : 'div.rg_di' ,
409408 'link' : 'a.rg_l::attr(href)'
410409 },
@@ -422,12 +421,12 @@ def __init__(self, *args, **kwargs):
422421
423422 def after_parsing (self ):
424423 """Clean the urls.
425-
424+
426425 A typical scraped result looks like the following:
427-
426+
428427 '/url?q=http://www.youtube.com/user/Apple&sa=U&ei=\
429428 lntiVN7JDsTfPZCMgKAO&ved=0CFQQFjAO&usg=AFQjCNGkX65O-hKLmyq1FX9HQqbb9iYn9A'
430-
429+
431430 Clean with a short regex.
432431 """
433432 super ().after_parsing ()
@@ -543,11 +542,10 @@ def after_parsing(self):
543542 try :
544543 i = self .html .index (substr )
545544 if i :
546- self .num_results_for_query = re .search (r'— (.)*?"' , self .html [i :i + len (self .query ) + 150 ]).group ()
545+ self .num_results_for_query = re .search (r'— (.)*?"' , self .html [i :i + len (self .query ) + 150 ]).group ()
547546 except Exception as e :
548547 logger .debug (str (e ))
549548
550-
551549 if self .searchtype == 'image' :
552550 for key , i in self .iter_serp_items ():
553551 for regex in (
@@ -626,7 +624,7 @@ class BingParser(Parser):
626624 'ch_ip' : {
627625 'container' : '#dg_c .imgres' ,
628626 'result_container' : '.dg_u' ,
629- 'link' : 'a.dv_i ::attr(m)'
627+ 'link' : 'a::attr(m)'
630628 },
631629 }
632630 }
@@ -1049,12 +1047,12 @@ def parse_serp(config, html=None, parser=None, scraper=None, search_engine=None,
10491047
10501048if __name__ == '__main__' :
10511049 """Originally part of https://github.com/NikolaiT/GoogleScraper.
1052-
1053- Only for testing purposes: May be called directly with an search engine
1050+
1051+ Only for testing purposes: May be called directly with an search engine
10541052 search url. For example:
1055-
1053+
10561054 python3 parsing.py 'http://yandex.ru/yandsearch?text=GoogleScraper&lr=178&csg=82%2C4317%2C20%2C20%2C0%2C0%2C0'
1057-
1055+
10581056 Please note: Using this module directly makes little sense, because requesting such urls
10591057 directly without imitating a real browser (which is done in my GoogleScraper module) makes
10601058 the search engines return crippled html, which makes it impossible to parse.
0 commit comments