Skip to content

Commit f51b71f

Browse files
author
Ronald Schmidt
committed
Fixes NikolaiT#149 plus some styles
1 parent 77a7aa1 commit f51b71f

4 files changed

Lines changed: 48 additions & 54 deletions

File tree

GoogleScraper/core.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
class WrongConfigurationError(Exception):
2828
pass
2929

30+
3031
def id_for_keywords(keywords):
3132
"""Determine a unique id for the keywords.
3233
@@ -97,7 +98,8 @@ def start_python_console(namespace=None, noipython=False, banner=''):
9798
except ImportError:
9899
pass
99100
else:
100-
import rlcompleter
101+
pass
102+
# import rlcompleter
101103

102104
readline.parse_and_bind("tab:complete")
103105
code.interact(banner=banner, local=namespace)
@@ -202,7 +204,7 @@ def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
202204
proxy_db = config.get('mysql_proxy_db', '')
203205

204206
# when no search engine is specified, use google
205-
search_engines = config.get('search_engines', ['google',])
207+
search_engines = config.get('search_engines', ['google'])
206208
if not isinstance(search_engines, list):
207209
if search_engines == '*':
208210
search_engines = config.get('supported_search_engines')
@@ -238,8 +240,7 @@ def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
238240
if not (keyword or keywords) and not kwfile:
239241
# Just print the help.
240242
get_command_line(True)
241-
print('No keywords to scrape for. Please provide either an keyword file (Option: --keyword-file) or specify and '
242-
'keyword with --keyword.')
243+
print('No keywords to scrape for. Please provide either an keyword file (Option: --keyword-file) or specify and keyword with --keyword.')
243244
return
244245

245246
cache_manager = CacheManager(config)
@@ -456,4 +457,4 @@ def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
456457
session.commit()
457458

458459
if return_results:
459-
return scraper_search
460+
return session

GoogleScraper/parsing.py

Lines changed: 23 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,12 @@ class Parser():
3030
"""Parses SERP pages.
3131
3232
Each search engine results page (SERP) has a similar layout:
33-
33+
3434
The main search results are usually in a html container element (#main, .results, #leftSide).
35-
There might be separate columns for other search results (like ads for example). Then each
35+
There might be separate columns for other search results (like ads for example). Then each
3636
result contains basically a link, a snippet and a description (usually some text on the
3737
target site). It's really astonishing how similar other search engines are to Google.
38-
38+
3939
Each child class (that can actual parse a concrete search engine results page) needs
4040
to specify css selectors for the different search types (Like normal search, news search, video search, ...).
4141
@@ -73,10 +73,10 @@ def __init__(self, config={}, html='', query=''):
7373
"""Create new Parser instance and parse all information.
7474
7575
Args:
76-
html: The raw html from the search engine search. If not provided, you can parse
76+
html: The raw html from the search engine search. If not provided, you can parse
7777
the data later by calling parse(html) directly.
7878
searchtype: The search type. By default "normal"
79-
79+
8080
Raises:
8181
Assertion error if the subclassed
8282
specific parser cannot handle the the settings.
@@ -109,8 +109,8 @@ def __init__(self, config={}, html='', query=''):
109109

110110
def parse(self, html=None):
111111
"""Public function to start parsing the search engine results.
112-
113-
Args:
112+
113+
Args:
114114
html: The raw html data to extract the SERP entries from.
115115
"""
116116
if html:
@@ -137,7 +137,7 @@ def _parse_lxml(self, cleaner=None):
137137

138138
def _parse(self, cleaner=None):
139139
"""Internal parse the dom according to the provided css selectors.
140-
140+
141141
Raises: InvalidSearchTypeException if no css selectors for the searchtype could be found.
142142
"""
143143
self.num_results = 0
@@ -152,8 +152,7 @@ def _parse(self, cleaner=None):
152152

153153
self.num_results_for_query = self.first_match(num_results_selector, self.dom)
154154
if not self.num_results_for_query:
155-
logger.debug('{}: Cannot parse num_results from serp page with selectors {}'.format(self.__class__.__name__,
156-
num_results_selector))
155+
logger.debug('{}: Cannot parse num_results from serp page with selectors {}'.format(self.__class__.__name__, num_results_selector))
157156

158157
# get the current page we are at. Sometimes we search engines don't show this.
159158
try:
@@ -180,7 +179,7 @@ def _parse(self, cleaner=None):
180179

181180
self.search_results[result_type] = []
182181

183-
for selector_specific, selectors in selector_class.items():
182+
for _, selectors in selector_class.items():
184183

185184
if 'result_container' in selectors and selectors['result_container']:
186185
css = '{container} {result_container}'.format(**selectors)
@@ -272,14 +271,14 @@ def first_match(self, selectors, element):
272271
match = self.advanced_css(selector, element=element)
273272
if match:
274273
return match
275-
except IndexError as e:
274+
except IndexError:
276275
pass
277276

278277
return False
279278

280279
def after_parsing(self):
281280
"""Subclass specific behaviour after parsing happened.
282-
281+
283282
Override in subclass to add search engine specific behaviour.
284283
Commonly used to clean the results.
285284
"""
@@ -312,7 +311,7 @@ def iter_serp_items(self):
312311

313312

314313
"""
315-
Here follow the different classes that provide CSS selectors
314+
Here follow the different classes that provide CSS selectors
316315
for different types of SERP pages of several common search engines.
317316
318317
Just look at them and add your own selectors in a new class if you
@@ -404,7 +403,7 @@ class GoogleParser(Parser):
404403
image_search_selectors = {
405404
'results': {
406405
'de_ip': {
407-
'container': 'li#isr_mc',
406+
'container': '#isr_mc',
408407
'result_container': 'div.rg_di',
409408
'link': 'a.rg_l::attr(href)'
410409
},
@@ -422,12 +421,12 @@ def __init__(self, *args, **kwargs):
422421

423422
def after_parsing(self):
424423
"""Clean the urls.
425-
424+
426425
A typical scraped results looks like the following:
427-
426+
428427
'/url?q=http://www.youtube.com/user/Apple&sa=U&ei=\
429428
lntiVN7JDsTfPZCMgKAO&ved=0CFQQFjAO&usg=AFQjCNGkX65O-hKLmyq1FX9HQqbb9iYn9A'
430-
429+
431430
Clean with a short regex.
432431
"""
433432
super().after_parsing()
@@ -543,11 +542,10 @@ def after_parsing(self):
543542
try:
544543
i = self.html.index(substr)
545544
if i:
546-
self.num_results_for_query = re.search(r'— (.)*?"', self.html[i:i+len(self.query) + 150]).group()
545+
self.num_results_for_query = re.search(r'— (.)*?"', self.html[i:i + len(self.query) + 150]).group()
547546
except Exception as e:
548547
logger.debug(str(e))
549548

550-
551549
if self.searchtype == 'image':
552550
for key, i in self.iter_serp_items():
553551
for regex in (
@@ -626,7 +624,7 @@ class BingParser(Parser):
626624
'ch_ip': {
627625
'container': '#dg_c .imgres',
628626
'result_container': '.dg_u',
629-
'link': 'a.dv_i::attr(m)'
627+
'link': 'a::attr(m)'
630628
},
631629
}
632630
}
@@ -1049,12 +1047,12 @@ def parse_serp(config, html=None, parser=None, scraper=None, search_engine=None,
10491047

10501048
if __name__ == '__main__':
10511049
"""Originally part of https://github.com/NikolaiT/GoogleScraper.
1052-
1053-
Only for testing purposes: May be called directly with an search engine
1050+
1051+
Only for testing purposes: May be called directly with an search engine
10541052
search url. For example:
1055-
1053+
10561054
python3 parsing.py 'http://yandex.ru/yandsearch?text=GoogleScraper&lr=178&csg=82%2C4317%2C20%2C20%2C0%2C0%2C0'
1057-
1055+
10581056
Please note: Using this module directly makes little sense, because requesting such urls
10591057
directly without imitating a real browser (which is done in my GoogleScraper module) makes
10601058
the search engines return crippled html, which makes it impossible to parse.

GoogleScraper/search_engine_parameters.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373
# current geographic location.
7474
'safe': 'off', # Turns the adult content filter on or off
7575
'rls': None,
76-
#Source of query with version of the client and language set. With firefox set to 'org.mozilla:en-US:official'
76+
# Source of query with version of the client and language set. With firefox set to 'org.mozilla:en-US:official'
7777
'sa': None,
7878
# User search behavior parameter sa=N: User searched, sa=X: User clicked on related searches in the SERP
7979
'source': None, # Google navigational parameter specifying where you came from, univ: universal search
@@ -117,8 +117,8 @@
117117
'oe': 'UTF-8', # Sets the character encoding that is used to encode the results.
118118
'ip': None,
119119
# When queries are made using the HTTP protocol, the ip parameter contains the IP address of the user
120-
#who submitted the search query. You do not supply this parameter with the search request. The ip
121-
#parameter is returned in the XML search results. For example:
120+
# who submitted the search query. You do not supply this parameter with the search request. The ip
121+
# parameter is returned in the XML search results. For example:
122122
'sitesearch': None,
123123
# Limits search results to documents in the specified domain, host, or web directory. Has no effect if the q
124124
# parameter is empty. This parameter has the same effect as the site special query term.
@@ -147,19 +147,19 @@
147147
# ft are: 'i': filetype and 'e': -filetype
148148
'as_lq': None,
149149
# Specifies a URL, and causes search results to show pages that link to the that URL. This parameter has
150-
#the same effect as the link special query term (see “Back Links” on page 20). No other query terms can
151-
#be used when using this parameter.
150+
# the same effect as the link special query term (see “Back Links” on page 20). No other query terms can
151+
# be used when using this parameter.
152152
'as_occt': None,
153153
# Specifies where the search engine is to look for the query terms on the page: anywhere on the page, in
154-
#the title, or in the URL.
154+
# the title, or in the URL.
155155
'as_oq': None,
156156
# Combines the specified terms to the search query in parameter q, with an OR operation. This parameter
157157
# has the same effect as the OR special query term (see “Boolean OR Search” on page 20).
158158
'as_q': None, # Adds the specified query terms to the query terms in parameter q.
159159
'as_sitesearch': None,
160160
# Limits search results to documents in the specified domain, host or web directory, or excludes results
161-
#from the specified location, depending on the value of as_dt. This parameter has the same effect as the
162-
#site or -site special query terms. It has no effect if the q parameter is empty.
161+
# from the specified location, depending on the value of as_dt. This parameter has the same effect as the
162+
# site or -site special query terms. It has no effect if the q parameter is empty.
163163
'entqr': None, # This parameter sets the query expansion policy according to the following valid values:
164164
# 0: None
165165
# 1: Standard Uses only the search appliance’s synonym file.
@@ -182,7 +182,7 @@
182182
183183
"""
184184
bing_search_params = {
185-
185+
'adlt': 'off'
186186
}
187187

188188
"""

GoogleScraper/selenium_mode.py

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ class SelScrape(SearchEngineScrape, threading.Thread):
5858
'google': '#pnnext',
5959
'yandex': '.pager__button_kind_next',
6060
'bing': '.sb_pagN',
61-
'yahoo': '#pg-next',
61+
'yahoo': '.compPagination .next',
6262
'baidu': '.n',
6363
'ask': '#paging div a.txt3.l_nu',
6464
'blekko': '',
@@ -301,7 +301,7 @@ def handle_request_denied(self, status_code):
301301

302302
if self.config.get('manual_captcha_solving', False):
303303
with self.captcha_lock:
304-
import tempfile
304+
# import tempfile
305305

306306
tf = tempfile.NamedTemporaryFile('wb')
307307
tf.write(self.webdriver.get_screenshot_as_png())
@@ -450,15 +450,18 @@ def _find_next_page_element(self):
450450
try:
451451
# wait until the next page link is clickable
452452
WebDriverWait(self.webdriver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
453-
except (WebDriverException, TimeoutException) as e:
453+
except (WebDriverException, TimeoutException):
454454
self._save_debug_screenshot()
455-
raise Exception('{}: Cannot locate next page element: {}'.format(self.name, str(e)))
455+
# raise Exception('{}: Cannot locate next page element: {}'.format(self.name, str(e)))
456456

457457
return self.webdriver.find_element_by_css_selector(selector)
458458

459459
elif self.search_type == 'image':
460460
self.page_down()
461-
return True
461+
if self.search_engine_name == 'google':
462+
return self.webdriver.find_element_by_css_selector('input._kvc')
463+
else:
464+
return True
462465

463466
def wait_until_serp_loaded(self):
464467
"""
@@ -595,17 +598,9 @@ def page_down(self):
595598
Used for next page in image search mode or when the
596599
next results are obtained by scrolling down a page.
597600
"""
598-
js = '''
599-
var w = window,
600-
d = document,
601-
e = d.documentElement,
602-
g = d.getElementsByTagName('body')[0],
603-
y = w.innerHeight|| e.clientHeight|| g.clientHeight;
604-
605-
window.scrollBy(0,y);
606-
return y;
607-
'''
601+
js = 'window.scrollTo(0,document.body.scrollHeight);'
608602

603+
time.sleep(5)
609604
self.webdriver.execute_script(js)
610605

611606
def run(self):

0 commit comments

Comments (0)