# Source code for app

# * import list of asins from file
# * ability to change proxy sources
# * manage ssl errors!! 
# * manage timeouts
# * and manage wait time
# * manage quiet
# * manage destination
# * manage ignore_dups

from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from pprint import pprint
from math import *
import requests
import random
import logging
import argparse
import threading, Queue
import time
import json
import re
import ssl
import os, sys 

from constants import *

class AmazonScraper(object):
    """Download customer reviews and questions for a list of Amazon ASINs.

    For every ASIN the scraper refreshes its proxy list, fetches the product
    page, and then scrapes the review pages and the question pages with one
    thread per page. Results are written as one JSON object per line to
    ``<destination>/reviews/<asin>.json`` and
    ``<destination>/questions/<asin>.json``.

    Recognised keyword arguments (anything else is ignored):
    ``asin``, ``verbose``, ``quiet``, ``ignore_dups``, ``no_reviews``,
    ``no_questions``, ``destination`` and ``save_pages``.
    """

    # Number of times a single URL is requested before it is given up on.
    MAX_ATTEMPTS = 10
    # Per-request timeout, in seconds.
    REQUEST_TIMEOUT = 20
    # Question pages 1..QUESTION_PAGES are requested for every product.
    QUESTION_PAGES = 99
    # Number of reviews shown on one review page.
    REVIEWS_PER_PAGE = 10
    # A table cell whose text starts with a dotted quad is taken as a proxy.
    _IP_PATTERN = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')

    def __init__(self, **kwargs):
        """Store the supported options, falling back to their defaults."""
        default_attr = dict(
            asin=[],
            verbose=False,
            quiet=False,
            ignore_dups=False,
            no_reviews=False,
            no_questions=False,
            destination='./',
            save_pages=True,
        )
        # Only the known options become attributes; an option that was not
        # supplied keeps its default instead of silently becoming None.
        for key, default in default_attr.items():
            setattr(self, key, kwargs.get(key, default))

        self.logger = AmazonScraper.get_logger(level=logging.DEBUG,
                                               verbose=self.verbose)
        # initialize a user agent generator
        self.ua = UserAgent()
        self.proxies = set()
        self.wait_time = 0.2

    # ------------------------------------------------------------------
    # orchestration
    # ------------------------------------------------------------------
    def scrape(self):
        """Manage the whole scraping process for every configured ASIN.

        A ``RuntimeError`` raised while handling one product is logged and
        the next product is processed. A keyboard interrupt terminates the
        program through ``sys.exit()``.
        """
        for asin in self.asin or []:
            try:
                self.logger.info("Examining product " + asin)
                # The proxy list is refreshed for each product because
                # too many proxies get banned by amazon.
                self.proxies = self.get_proxies()

                # collect the reviews url and the total amount
                # of pages to be scraped
                main_reviews_url, review_pages_number = self.retrieve_page(asin)

                if not self.no_reviews:
                    reviews, failed_urls = self.retrieve_reviews(
                        main_reviews_url, review_pages_number)
                    reviews_list = list(reviews.queue)
                    failed_urls_list = list(failed_urls.queue)
                    self.logger.info("found " + str(len(reviews_list))
                                     + " reviews for product " + asin)
                    self.logger.info("failed " + str(len(failed_urls_list))
                                     + " requests for product " + asin)
                    self._dump_json_lines('reviews', asin, reviews_list)

                if not self.no_questions:
                    questions, fails = self.retrieve_questions(asin)
                    questions_list = list(questions.queue)
                    fails_questions_list = list(fails.queue)
                    self.logger.info("found " + str(len(questions_list))
                                     + " questions for product " + asin)
                    self.logger.info("failed " + str(len(fails_questions_list))
                                     + " urls for product " + asin)
                    self._dump_json_lines('questions', asin, questions_list)
            except RuntimeError as err:
                self.logger.warning(err)
            except KeyboardInterrupt:
                self.logger.warning('Keyboard interrupt received')
                sys.exit()

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _output_dir(self, name):
        """Return ``<destination>/<name>``, creating the folder if needed."""
        base = getattr(self, 'destination', None) or './'
        folder = os.path.join(base, name)
        if not os.path.exists(folder):
            os.makedirs(folder)
        return folder

    def _dump_json_lines(self, folder_name, asin, items):
        """Write ``items`` to ``<folder_name>/<asin>.json``, one per line."""
        path = os.path.join(self._output_dir(folder_name), asin + ".json")
        with open(path, 'w') as out:
            for item in items:
                json.dump(item, out)
                out.write('\n')

    def _request_options(self):
        """Return the header/proxy keyword arguments used by every request."""
        options = {'headers': {'User-Agent': self.ua.random}}
        proxies = getattr(self, 'proxies', None)
        if proxies:
            # requests expects one proxy string per scheme, not a list.
            options['proxies'] = {'http': random.choice(list(proxies))}
        return options

    def _fetch_soup(self, url):
        """Request ``url`` and return its parsed HTML.

        Raises ``RuntimeError`` when the status code is not 200 or when the
        page title is "Robot Check".
        """
        res = requests.get(url, timeout=self.REQUEST_TIMEOUT,
                           **self._request_options())
        if res.status_code != 200:
            raise RuntimeError("Server not responding: status code "
                               + str(res.status_code) + " for url " + url)
        soup = BeautifulSoup(res.content, 'html.parser')
        if soup.title is not None and soup.title.text == "Robot Check":
            raise RuntimeError("Robot Check not passed for url " + url)
        return soup

    def _scrape_with_retries(self, url, parse, fails, results):
        """Fetch ``url`` and hand the parsed page to ``parse(soup, results)``.

        Up to ``MAX_ATTEMPTS`` attempts are made. Returns True on success;
        otherwise the url is put on ``fails`` and False is returned.
        """
        for _ in range(self.MAX_ATTEMPTS):
            try:
                parse(self._fetch_soup(url), results)
                return True
            except requests.exceptions.Timeout:
                self.logger.debug("Connection timed out for url " + url)
            except requests.exceptions.RequestException as err:
                self.logger.debug(err)
            except RuntimeError as err:
                self.logger.debug(err)
            except KeyboardInterrupt:
                self.logger.warning('Keyboard interrupt received')
                sys.exit()
        self.logger.debug("failed url " + url + " after several attempts")
        fails.put(url)
        return False

    def _run_threads(self, target, argument_tuples):
        """Start one thread per argument tuple and wait for all of them."""
        threads = []
        for arguments in argument_tuples:
            worker = threading.Thread(target=target, args=arguments)
            worker.start()
            threads.append(worker)
        for worker in threads:
            worker.join()

    # ------------------------------------------------------------------
    # questions
    # ------------------------------------------------------------------
    def retrieve_questions(self, asin):
        """Scrape the question pages of ``asin``.

        Returns a ``(results, fails)`` pair of queues: ``results`` holds
        dicts with ``question`` and ``answer`` keys, ``fails`` holds the urls
        that could not be scraped.
        """
        results = Queue.Queue()
        fails = Queue.Queue()
        self._run_threads(
            self.scrape_page_questions,
            [(asin, page_num, fails, results)
             for page_num in range(1, self.QUESTION_PAGES + 1)],
        )
        return results, fails

    def scrape_page_questions(self, asin, page_num, fails, results):
        """Thread body: request and scrape a single questions page."""
        url = base_questions_url + asin + '/' + str(page_num)
        self._scrape_with_retries(url, self._parse_questions, fails, results)

    def _parse_questions(self, soup, results):
        """Put one question/answer dict per complete question box on ``results``."""
        cell_class = {"class": "a-fixed-left-grid-col a-col-right"}
        # The first matching box is skipped, as it is not a question entry.
        for box in soup.find_all("div", cell_class)[1:]:
            q_a_dict = {}
            try:
                for k, cell in enumerate(box.find_all("div", cell_class)):
                    if k == 0:
                        q_a_dict['question'] = cell.a.text.strip()
                        continue
                    # Every later cell is an answer; the last one wins.
                    long_text = cell.find("span", {"class": "askLongText"})
                    if long_text is not None:
                        # Drops the last 8 characters of the expanded answer
                        # (presumably a trailing "see less" label - confirm).
                        ranswer = long_text.text.strip()[:-8]
                    else:
                        ranswer = cell.span.text
                    q_a_dict['answer'] = ranswer.strip()
            except AttributeError as err:
                # A box without the expected markup is skipped rather than
                # killing the worker thread.
                self.logger.debug("skipping malformed question box: " + str(err))
                continue
            if 'answer' in q_a_dict and 'question' in q_a_dict:
                results.put(q_a_dict)

    # ------------------------------------------------------------------
    # reviews
    # ------------------------------------------------------------------
    def retrieve_reviews(self, main_reviews_url, review_pages_number):
        """Collect the reviews given the url of the reviews page.

        ``main_reviews_url`` is the site-relative link returned by
        ``retrieve_page`` and ``review_pages_number`` the number of pages to
        request. Returns a ``(results, fails)`` pair of queues.
        """
        results = Queue.Queue()
        fails = Queue.Queue()
        if not main_reviews_url or review_pages_number < 1:
            return results, fails

        # Rebuild the url without its last path segment; that segment is
        # replaced by page-specific parameters below.
        url_parts = (base_amazon_url + main_reviews_url[1:]).strip().split("/")
        prefix = "https:/" + "".join(part + '/' for part in url_parts[1:-1])

        jobs = []
        # Pages are numbered from 1 to review_pages_number inclusive.
        for page_num in range(1, review_pages_number + 1):
            chosen_params = [
                'ref=cm_cr_dp_d_show_all_btm_' + str(page_num) + '?ie=UTF8',
                'pageNumber=' + str(page_num),
            ]
            jobs.append((prefix + '&'.join(chosen_params), fails, results))

        self._run_threads(self.scrape_page_reviews, jobs)
        return results, fails

    def scrape_page_reviews(self, url, fails, results):
        """Thread body: request and scrape a single review page."""
        if self._scrape_with_retries(url, self._parse_reviews, fails, results):
            self.logger.info("finished scraping " + url)

    def _parse_reviews(self, soup, results):
        """Put one dict per well-formed review box on ``results``."""
        for box in soup.find_all("div", {"class": "review"}):
            try:
                author = box.find("a", {"class": "author"})
                review = {
                    'title': box.find("a", {"class": "review-title"}).text,
                    'text': box.find("span", {"class": "review-text"}).text,
                    'date': box.find("span", {"class": "review-date"}).text,
                    'rating': box.find("i", {"class": "review-rating"}).span.text,
                    'author': author.text,
                    'author_id': author.get('href').split('/')[4],
                }
            except (AttributeError, IndexError) as err:
                self.logger.debug("skipping malformed review box: " + str(err))
                continue
            results.put(review)

    # ------------------------------------------------------------------
    # product page
    # ------------------------------------------------------------------
    def retrieve_page(self, asin):
        """Request the main product page, save it, and locate the reviews.

        Returns ``(reviews_url, review_pages_number)``. When the page has no
        usable reviews footer the result is ``(None, 0)``.

        Raises ``RuntimeError`` if the ASIN does not exist (HTTP 404) or if
        the page could not be fetched within ``MAX_ATTEMPTS`` attempts.
        """
        for _ in range(self.MAX_ATTEMPTS):
            try:
                res = requests.get(base_product_page_url + asin,
                                   timeout=self.REQUEST_TIMEOUT,
                                   **self._request_options())
            except requests.exceptions.RequestException as err:
                self.logger.error("Connection error on asin " + asin
                                  + ": " + str(err))
                continue

            # if the asin does not exist, exit.
            if res.status_code == 404:
                self.logger.error("Asin " + asin + " does not exist")
                raise RuntimeError("Asin " + asin + " does not exist")
            # if the connection fails, try again
            if res.status_code != 200:
                self.logger.error("Connection error on asin " + asin)
                continue

            soup = BeautifulSoup(res.content, 'html.parser')
            # eventually amazon discovers us.
            # in this case, try again with another proxy
            if soup.title is not None and soup.title.text == "Robot Check":
                self.logger.warning("Robot Check received")
                continue

            # save the page if the user specified to
            if self.save_pages:
                page_path = os.path.join(self._output_dir('pages'),
                                         asin + '.html')
                # res.content is raw bytes, hence the binary mode.
                with open(page_path, 'wb') as page_file:
                    page_file.write(res.content)

            return self._locate_reviews(soup)

        raise RuntimeError("Fetching product " + asin
                           + " failed after several attempts.")

    def _locate_reviews(self, soup):
        """Extract the reviews link and the page count from a product page."""
        footer = soup.find("div", {"id": "reviews-medley-footer"})
        if footer is None:
            return None, 0
        try:
            reviews_url = footer.find(
                "a", {"class": "a-link-emphasis"}).get("href")
            # The footer link reads "See all <N> customer ..."; N may
            # contain thousands separators.
            total = (footer.a.text
                     .split("See all ")[1]
                     .split(" customer")[0]
                     .replace(",", ""))
            review_pages_number = int(
                ceil(float(total) / self.REVIEWS_PER_PAGE))
        except (AttributeError, IndexError, ValueError) as err:
            self.logger.warning("Could not read the reviews footer: "
                                + str(err))
            return None, 0
        if not reviews_url:
            return None, 0
        return reviews_url, review_pages_number

    # ------------------------------------------------------------------
    # proxies
    # ------------------------------------------------------------------
    def get_proxies(self):
        """Retrieve a set of proxy addresses.

        Every source page is expected to contain a table with the id
        ``proxylisttable``; each cell whose text starts with a dotted quad
        is collected.

        Raises ``requests.ConnectionError`` when no proxy could be found
        after ``MAX_ATTEMPTS`` rounds.
        """
        proxies = set()

        # eventually put this somewhere else
        # NOTE(review): the four source URLs are blank in this copy of the
        # file; they must be restored before any proxy can be retrieved.
        proxy_sources = [
            '',
            '',
            '',
            '',
        ]

        for attempt in range(self.MAX_ATTEMPTS):
            for source in proxy_sources:
                if not source:
                    continue
                try:
                    res = requests.get(
                        source, headers={'User-Agent': self.ua.random})
                except requests.exceptions.RequestException as err:
                    self.logger.error("connection error " + str(err)
                                      + " source " + source)
                    continue
                if res.status_code != 200:
                    self.logger.error("connection error "
                                      + str(res.status_code)
                                      + " source " + source)
                    continue
                soup = BeautifulSoup(res.content, 'html.parser')
                tab = soup.find("table", {"id": "proxylisttable"})
                if tab is None:
                    continue
                for cell in tab.find_all('td'):
                    if (cell.string is not None
                            and self._IP_PATTERN.match(cell.string)):
                        proxies.add(cell.string)

            self.logger.info("found " + str(len(proxies)) + " proxies")
            if proxies:
                return proxies
            # Pause before the next round, except after the last one.
            if attempt < self.MAX_ATTEMPTS - 1:
                time.sleep(0.5)

        raise requests.ConnectionError(
            "Failed to retrieve any proxy after several attempts, "
            "check your connection status")

    # ------------------------------------------------------------------
    # static utilities
    # ------------------------------------------------------------------
    @staticmethod
    def get_logger(level=logging.DEBUG, verbose=False):
        """Return the module logger, writing to ``scrape.log`` and stdout.

        The file handler records everything from ``level`` upwards. The
        console handler shows DEBUG and above when ``verbose`` is set and
        ERROR and above otherwise.
        """
        logger = logging.getLogger(__name__)

        # Drop handlers left by an earlier call so that every message is
        # emitted only once.
        for stale in list(logger.handlers):
            logger.removeHandler(stale)
            stale.close()

        # 'wa' is not a valid file mode; 'w' keeps its truncate-on-start
        # reading.
        fh = logging.FileHandler('scrape.log', 'w')
        fh.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        fh.setLevel(level)
        logger.addHandler(fh)

        sh = logging.StreamHandler(sys.stdout)
        sh.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
        sh.setLevel(logging.DEBUG if verbose else logging.ERROR)
        logger.addHandler(sh)

        logger.setLevel(level)
        return logger

    @staticmethod
    def parse_asins_from_file(path):
        """Read a list of asins from a file, one per line.

        Surrounding whitespace is stripped and blank lines are skipped.
        Raises ``ValueError`` when the file cannot be read.
        """
        try:
            with open(path, 'r') as file_to_read:
                stripped = (line.strip() for line in file_to_read)
                return [asin for asin in stripped if asin]
        except IOError as err:
            raise ValueError("File not found " + str(err))

    @staticmethod
    def parse_args(args):
        """Parse the command-line arguments in ``args``.

        Returns the ``argparse`` namespace; when ``--file`` is used its
        ``asin`` attribute is filled from that file.

        Raises ``ValueError`` when neither or both of the asin arguments and
        ``--file`` are given.
        """
        parser = argparse.ArgumentParser(
            description="amazon-scraper downloads questions and reviews from amazon products",
            formatter_class=argparse.RawDescriptionHelpFormatter,
            fromfile_prefix_chars='@',
        )
        parser.add_argument('asin', nargs='*',
                            help='Amazon asin(s) to be scraped')
        parser.add_argument('--file', '-f',
                            help='Specify path to list of asins')
        parser.add_argument('--save-pages', '-p', action='store_true',
                            default=True,
                            help='Saves the main pages scraped')
        # --save-pages defaults to True, so this is the only way to turn
        # page saving off.
        parser.add_argument('--no-save-pages', dest='save_pages',
                            action='store_false',
                            help='Do not save the main pages scraped')
        parser.add_argument('--verbose', '-v', action='store_true',
                            default=False, help='Logging verbosity level')
        parser.add_argument('--no-reviews', action='store_true',
                            default=False, help='Do not scrape reviews')
        parser.add_argument('--no-questions', action='store_true',
                            default=False, help='Do not scrape questions')
        parser.add_argument('--destination', '-d', default='./',
                            help="Set a destination folder")
        parser.add_argument('--ignore-dups', action='store_true',
                            help="Do not consider previous operations")
        parser.add_argument('--quiet', '-q', default=False,
                            action='store_true',
                            help='Be quiet while scraping')

        args = parser.parse_args(args)

        if not args.asin and args.file is None:
            parser.print_help()
            raise ValueError('Please provide asin or filename.')
        if args.asin and args.file:
            parser.print_help()
            raise ValueError('Please provide only one of the following: asin(s) or filename')

        if args.file:
            args.asin = AmazonScraper.parse_asins_from_file(args.file)
        return args
def main():
    """Command-line entry point: parse ``sys.argv`` and run the scraper.

    A usage error reported by ``AmazonScraper.parse_args`` ends the program
    with that message instead of a traceback.
    """
    try:
        args = AmazonScraper.parse_args(sys.argv[1:])
    except ValueError as err:
        sys.exit(str(err))
    # The namespace is passed through unchanged; the scraper ignores the
    # keys it does not know about.
    scraper = AmazonScraper(**vars(args))
    scraper.scrape()


if __name__ == '__main__':
    main()