"""Management command that scrapes wholesaler product listings into a JSON file.

The category pages listed in ``WHOLESALERS`` are downloaded, parsed with
BeautifulSoup and written out as one JSON document keyed by wholesaler name.
"""
import httplib
import json
import re
import sys
import urllib2
from contextlib import closing
from decimal import *  # the code below uses Decimal and ROUND_DOWN from here
from time import sleep

from BeautifulSoup import BeautifulSoup
from django.core.management.base import BaseCommand, CommandError

# Nested mapping: wholesaler -> category -> ... -> listing URL.
# Every dict level is a category name; every string leaf is a page to scrape.
# NOTE(review): the Makro URLs embed a fixed Session_ID value - presumably it
# can expire on the remote side; confirm before relying on these URLs.
WHOLESALERS = {
    "Makro": {
        "Groceries": {
            "Carbonated Soft Drinks": "http://www.makro.co.za/live/content.php?SortBy=1&ItemsPerPage=9999&Region=1&Action=catalog&Cat=58&Gifts=&catId=&Start=0&Images=0&Query=&ShowAll=1&Brand=&Extended=&Reduced=&Promo=&Session_ID=f9850a12c1942df9c77866b3bbf22654",
            "Confectionary & Beverage": {
                "Snack": "http://www.makro.co.za/live/content.php?SortBy=1&ItemsPerPage=9999&Region=1&Action=catalog&Cat=82&Gifts=&catId=&Start=0&Images=0&Query=&ShowAll=&Brand=&Extended=&Reduced=&Promo=&Session_ID=f9850a12c1942df9c77866b3bbf22654",
                "Confectionery": "http://www.makro.co.za/live/content.php?SortBy=1&ItemsPerPage=9999&Region=1&Action=catalog&Cat=84&Gifts=&catId=&Start=0&Images=0&Query=&ShowAll=1&Brand=&Extended=&Reduced=&Promo=&Session_ID=f9850a12c1942df9c77866b3bbf22654",
            },
        },
    },
    "Woolworths": {
        "Food & Household": {
            "Beverages": {
                "Carbonated Drinks": {
                    "Cans": "http://www.woolworths.co.za/store/browse/category.jsp?q_docSort=&categoryId=cat420030&addFacet=9004%3Acat420030&howMany=99999&q_pageNum=1&viewAll=false",
                },
            },
            "Snacks, Sweets & Biscuits": {
                "Chips & Other Snacks": {
                    "Chips / Crisps": "http://www.woolworths.co.za/store/browse/category.jsp?q_docSort=&categoryId=cat420218&addFacet=9004%3Acat420218&howMany=99999&q_pageNum=1&viewAll=false",
                    "Snack Bars": "http://www.woolworths.co.za/store/browse/category.jsp?q_docSort=&categoryId=cat420220&addFacet=9004%3Acat420220&howMany=99999&q_pageNum=1&viewAll=false",
                },
                "Chocolate Bars & Boxes": {
                    "Boxes": "http://www.woolworths.co.za/store/browse/category.jsp?q_docSort=&categoryId=cat420226&addFacet=9004%3Acat420226&howMany=99999&q_pageNum=1&viewAll=false",
                    "Chocolate Bars": "http://www.woolworths.co.za/store/browse/category.jsp?q_docSort=&categoryId=cat420224&addFacet=9004%3Acat420224&howMany=99999&q_pageNum=1&viewAll=false",
                },
                "Dried Fruit": "http://www.woolworths.co.za/store/browse/category.jsp?q_docSort=&categoryId=cat200032&addFacet=9004%3Acat200032&howMany=99999&q_pageNum=1&viewAll=false",
                "Nuts": "http://www.woolworths.co.za/store/browse/category.jsp?q_docSort=&categoryId=cat200026&addFacet=9004%3Acat200026&howMany=99999&q_pageNum=1&viewAll=false",
                "Popcorn": "http://www.woolworths.co.za/store/browse/category.jsp?q_docSort=&categoryId=cat200024&addFacet=9004%3Acat200024&howMany=99999&q_pageNum=1&viewAll=false",
            },
        },
    },
}


def _format_price(raw_price):
    """Return raw_price as a string truncated (not rounded) to two decimals."""
    return str(Decimal(raw_price).quantize(Decimal('.01'), rounding=ROUND_DOWN))


class Command(BaseCommand):
    """Scrape every URL in ``WHOLESALERS`` and dump the products as JSON."""

    args = '<output_file>'
    help = 'This command generates a json file which will eventually turn into a fixture file for import into database'

    def __init__(self, *args, **kwargs):
        super(Command, self).__init__(*args, **kwargs)
        # One shared opener that sends a browser-like User-agent header.
        self.opener = urllib2.build_opener()
        self.opener.addheaders = [('User-agent', 'Mozilla/5.0')]

    def _fetch_soup(self, url):
        """Download url with the shared opener and return the parsed page."""
        # closing() releases the response even when read() or parsing raises.
        with closing(self.opener.open(url)) as page:
            return BeautifulSoup(page.read())

    def parseMakroPage(self, url):
        """Scrape one Makro listing page.

        Returns a list of dicts with the keys 'brand', 'variation', 'sku',
        'product_id', 'link' and 'price' (all strings). Entries whose markup
        is missing an expected piece (IndexError) are reported on stderr and
        skipped.
        """
        soup = None
        # IncompleteRead / BadStatusLine are retried after a one second pause.
        # NOTE(review): the retry is unbounded, as in the original code; a
        # permanently failing URL keeps this loop running forever.
        while soup is None:
            try:
                soup = self._fetch_soup(url)
            except (httplib.IncompleteRead, httplib.BadStatusLine):
                print("Read error occurred, sleeping for 1s then I will try again!")
                sleep(1)

        base_url = "http://www.makro.co.za"
        data = []
        for product in soup.findAll('table', attrs={"background": "/live/images/product_back.gif"}):
            try:
                title = product.find(attrs={'class': 'style4'})
                brand = title.contents[0]
                # At least one product data entry is broken (empty brand node)
                if len(brand) > 0:
                    brand = brand.contents[0].strip()
                else:
                    brand = ""
                variation = title.contents[1].strip()
                detail_href = product.find(attrs={"class": "style20"})['href']
                # Decimal round-trip normalises the SKU text taken from the URL.
                sku = str(Decimal(detail_href.split('Sku=')[1].split('|')[0]))
                product_id = detail_href.split('ProdId=')[1].split('&')[0]
                # Keep only the part before the first '&' and drop the leading '/'.
                link = "%s/%s" % (base_url, title['href'].split('&')[0][1:])
                price = product.find(attrs={'class': 'style5'}).contents[0].strip().split(' ')[1].strip()
            except IndexError:
                # Previously this dropped into pdb and then carried on with
                # unbound or stale fields; skip the broken entry instead.
                sys.stderr.write("Skipping malformed Makro product entry on %s\n" % url)
                continue

            price = _format_price(price)
            print("%s [%s]:%s - %s R %s (%s)" % (product_id, sku, brand, variation, price, link))
            data.append({
                'brand': brand,
                'variation': variation,
                'sku': sku,
                'product_id': product_id,
                'link': link,
                'price': price,
            })
        return data

    def parseWoolworthsPage(self, url):
        """Scrape one Woolworths listing page.

        Returns a list of dicts with the keys 'name', 'link', 'product_id'
        and 'price' (all strings).
        """
        soup = self._fetch_soup(url)

        base_url = "http://www.woolworths.co.za"
        data = []
        for product in soup.findAll('div', attrs={"class": "itemcontainerWW"}):
            anchor = product.find(attrs={"class": "itemheader"}).a
            name = anchor.contents[0].strip()
            # Drop the leading '/' of the href before joining it to the host.
            link = "%s/%s" % (base_url, anchor['href'][1:])
            # The product id is whatever sits between the first and second '='.
            product_id = link.split('=')[1]
            raw_price = product.find(attrs={"class": "itemprice_strike"}).contents[0].strip().split(' ')[1].strip()
            price = _format_price(raw_price)
            print("%s: %s R %s (%s)" % (product_id, name, price, link))
            data.append({
                'name': name,
                'link': link,
                'product_id': product_id,
                'price': price,
            })
        return data

    def recurseWholesalers(self, obj, parse_callback, categories=None, products=None):
        """Walk a (sub)tree of ``WHOLESALERS`` and collect the scraped products.

        obj            -- a dict of sub-categories, or a leaf value (URL).
        parse_callback -- called with each leaf; must return a list of dicts.
        categories     -- category names on the path from the root to obj.
        products       -- list the results are appended to, in place.

        Each product dict gets a 'categories' key holding its category path.
        Returns the products list (a new one when none was passed in).
        """
        # None sentinels instead of mutable defaults, which would be shared
        # between unrelated calls.
        if categories is None:
            categories = []
        if products is None:
            products = []

        if isinstance(obj, dict):
            for name, child in obj.items():
                # Fresh list per branch so sibling paths never share state.
                self.recurseWholesalers(child, parse_callback, list(categories) + [name], products)
        else:
            new_products = parse_callback(obj)
            for product in new_products:
                product['categories'] = categories
            products.extend(new_products)
        return products

    def handle(self, *args, **options):
        """Scrape all wholesalers and write the result to the file in args[0].

        Raises CommandError unless exactly one positional argument is given.
        """
        if len(args) != 1:
            raise CommandError("You need to specify the output filename")

        products = {
            'Woolworths': self.recurseWholesalers(WHOLESALERS['Woolworths'], self.parseWoolworthsPage),
            'Makro': self.recurseWholesalers(WHOLESALERS['Makro'], self.parseMakroPage),
        }
        with open(args[0], mode='w') as f:
            json.dump(products, f, indent=2)
It was a fun exercise, and perhaps it will help someone out there. There are still a couple of difficulties, such as identifying the same product at different wholesalers and handling product variations (e.g. flavour, size).