from django.core.management.base import BaseCommand, CommandError
from BeautifulSoup import BeautifulSoup
import urllib2, re, json
from decimal import *
# Catalogue of listing pages to scrape, expressed as a tree of nested dicts:
# wholesaler name -> category -> (sub-category ...) -> listing URL.
# Every leaf is a URL string; the dict keys on the way down to it form the
# category path of the products found on that page.
_MAKRO_LISTING_URL = (
    "http://www.makro.co.za/live/content.php?SortBy=1&ItemsPerPage=9999"
    "&Region=1&Action=catalog&Cat={cat}&Gifts=&catId=&Start=0&Images=0"
    "&Query=&ShowAll={show_all}&Brand=&Extended=&Reduced=&Promo="
    "&Session_ID=f9850a12c1942df9c77866b3bbf22654"
)
_WOOLWORTHS_LISTING_URL = (
    "http://www.woolworths.co.za/store/browse/category.jsp?q_docSort="
    "&categoryId={cat}&addFacet=9004%3A{cat}&howMany=99999&q_pageNum=1"
    "&viewAll=false"
)


def _makro_url(cat, show_all="1"):
    """Build the Makro listing URL for the numeric category ``cat``."""
    return _MAKRO_LISTING_URL.format(cat=cat, show_all=show_all)


def _woolworths_url(cat):
    """Build the Woolworths listing URL for the category id ``cat``."""
    return _WOOLWORTHS_LISTING_URL.format(cat=cat)


WHOLESALERS = {
    "Makro": {
        "Groceries": {
            "Carbonated Soft Drinks": _makro_url(58),
            "Confectionary & Beverage": {
                # This is the only listing requested with an empty ShowAll.
                "Snack": _makro_url(82, show_all=""),
                "Confectionery": _makro_url(84),
            },
        },
    },
    "Woolworths": {
        "Food & Household": {
            "Beverages": {
                "Carbonated Drinks": {
                    "Cans": _woolworths_url("cat420030"),
                },
            },
            "Snacks, Sweets & Biscuits": {
                "Chips & Other Snacks": {
                    "Chips / Crisps": _woolworths_url("cat420218"),
                    "Snack Bars": _woolworths_url("cat420220"),
                },
                "Chocolate Bars & Boxes": {
                    "Boxes": _woolworths_url("cat420226"),
                    "Chocolate Bars": _woolworths_url("cat420224"),
                },
                "Dried Fruit": _woolworths_url("cat200032"),
                "Nuts": _woolworths_url("cat200026"),
                "Popcorn": _woolworths_url("cat200024"),
            },
        },
    },
}
class Command(BaseCommand):
    """Scrape wholesaler listing pages and dump the products to a JSON file.

    ``handle`` walks the module-level ``WHOLESALERS`` tree for Woolworths and
    Makro, parses every listing page it reaches, and writes the collected
    products to the file named by the single positional argument.
    """

    args = '<output_file>'
    help = 'This command generates a json file which will eventually turn into a fixture file for import into database'

    def __init__(self, *args, **kwargs):
        super(Command, self).__init__(*args, **kwargs)
        # One shared opener so every request is sent with the same User-agent.
        self.opener = urllib2.build_opener()
        self.opener.addheaders = [('User-agent', 'Mozilla/5.0')]

    def _fetchSoup(self, url):
        """Download ``url`` with the shared opener and return it parsed as soup."""
        page = self.opener.open(url)
        try:
            return BeautifulSoup(page.read())
        finally:
            # Close the response even when the read fails part-way through.
            page.close()

    @staticmethod
    def _formatPrice(raw_price):
        """Return ``raw_price`` as a string truncated (not rounded) to 2 decimals."""
        return str(Decimal(raw_price).quantize(Decimal('.01'), rounding=ROUND_DOWN))

    def parseMakroPage(self, url):
        """Scrape one Makro listing page.

        Returns a list of dicts with the keys ``brand``, ``variation``,
        ``sku``, ``product_id``, ``link`` and ``price`` (price is a string
        with two decimals). Product entries whose markup is missing an
        expected piece (``IndexError`` while parsing) are reported and skipped.
        """
        import httplib
        from time import sleep

        soup = None
        while soup is None:
            # The original author kept hitting IncompleteRead errors on this
            # site; simply retrying after a short pause works around them.
            # NOTE: there is no retry limit, so this loops for as long as the
            # server keeps answering with one of these two errors.
            try:
                soup = self._fetchSoup(url)
            except (httplib.IncompleteRead, httplib.BadStatusLine):
                print("Read error occurred, sleeping for 1s then I will try again!")
                sleep(1)

        products = soup.findAll('table', attrs={"background": "/live/images/product_back.gif"})
        base_url = "http://www.makro.co.za"
        data = []
        for product in products:
            try:
                title_link = product.find(attrs={'class': 'style4'})
                detail_link = product.find(attrs={"class": "style20"})

                brand = title_link.contents[0]
                # At least one product data entry is broken (empty brand node).
                # Keep the explicit len() test: the truthiness of a soup node
                # is not guaranteed to reflect whether it has contents.
                if len(brand) > 0:
                    brand = brand.contents[0].strip()
                else:
                    brand = ""
                variation = title_link.contents[1].strip()

                detail_href = detail_link['href']
                sku = str(Decimal(detail_href.split('Sku=')[1].split('|')[0]))
                product_id = detail_href.split('ProdId=')[1].split('&')[0]
                # Drop the leading character of the href (presumably "/") so
                # that joining with base_url does not double it.
                link = "%s/%s" % (base_url, title_link['href'].split('&')[0][1:])
                price = product.find(attrs={'class': 'style5'}).contents[0].strip().split(' ')[1].strip()
            except IndexError:
                # Malformed entry: skip it rather than stopping in a debugger
                # and then recording stale values from the previous product.
                print("Skipping malformed product entry on %s" % url)
                continue

            price = self._formatPrice(price)
            print("%s [%s]:%s - %s R %s (%s)" % (product_id, sku, brand, variation, price, link))
            data.append({
                'brand': brand,
                'variation': variation,
                'sku': sku,
                'product_id': product_id,
                'link': link,
                'price': price,
            })
        return data

    def parseWoolworthsPage(self, url):
        """Scrape one Woolworths listing page.

        Returns a list of dicts with the keys ``name``, ``link``,
        ``product_id`` and ``price`` (price is a string with two decimals).
        """
        soup = self._fetchSoup(url)
        products = soup.findAll('div', attrs={"class": "itemcontainerWW"})
        base_url = "http://www.woolworths.co.za"
        data = []
        for product in products:
            header_link = product.find(attrs={"class": "itemheader"}).a
            name = header_link.contents[0].strip()
            link = "%s/%s" % (base_url, header_link['href'][1:])
            # The product id is whatever follows the first "=" in the link.
            product_id = link.split('=')[1]
            # NOTE(review): only the "itemprice_strike" element is read; confirm
            # that every product on the page actually carries one.
            raw_price = product.find(attrs={"class": "itemprice_strike"}).contents[0].strip().split(' ')[1].strip()
            price = self._formatPrice(raw_price)
            print("%s: %s R %s (%s)" % (product_id, name, price, link))
            data.append({
                'name': name,
                'link': link,
                'product_id': product_id,
                'price': price,
            })
        return data

    def recurseWholesalers(self, obj, parse_callback, categories=None, products=None):
        """Walk a category tree and collect the products of every leaf page.

        ``obj`` is either a dict (category name -> subtree) or a leaf value
        that is handed to ``parse_callback``, which must return a list of
        product dicts. Each product gets a ``categories`` entry holding the
        list of dict keys that led to its leaf.

        ``products`` is extended in place and also returned; when omitted a
        fresh list is created for this call.
        """
        # None defaults instead of [] so separate calls never share state.
        if categories is None:
            categories = []
        if products is None:
            products = []

        if isinstance(obj, dict):
            for name, child in obj.items():
                child_categories = list(categories)
                child_categories.append(name)
                self.recurseWholesalers(child, parse_callback, child_categories, products)
        else:
            new_products = parse_callback(obj)
            for product in new_products:
                product['categories'] = categories
            products.extend(new_products)
        return products

    def handle(self, *args, **options):
        """Scrape Woolworths and Makro and write the result as JSON.

        Expects exactly one positional argument, the output filename, and
        raises ``CommandError`` otherwise so the command exits with a failure
        status instead of silently doing nothing.
        """
        if len(args) != 1:
            raise CommandError("You need to specify the output filename")
        filename = args[0]

        woolworthsProducts = []
        makroProducts = []
        self.recurseWholesalers(WHOLESALERS['Woolworths'], self.parseWoolworthsPage, [], woolworthsProducts)
        self.recurseWholesalers(WHOLESALERS['Makro'], self.parseMakroPage, [], makroProducts)
        products = {
            'Woolworths': woolworthsProducts,
            'Makro': makroProducts,
        }
        with open(filename, mode='w') as f:
            json.dump(products, f, indent=2)
It was a fun exercise, and perhaps it will help someone out there. There are still a couple of difficulties, such as identifying the same products at different wholesalers and handling product variations (e.g. flavour and size).
Useful information shared. I am very happy to read this article. Thanks for giving us nice info. Fantastic walk through. I appreciate this post.
ReplyDeleteLaptops for sale
Trade Stocks, Forex, And Bitcoin Anywhere In The World:exness login Is The Leading Provider Of Software That Allows You To Trade On Your Own Terms. Whether You Are Operating In The Forex, Stock, cgin Software And Anonymous Digital Wallet To Connect With The Financial World.: exness login Is A Currency Trading Company That Allows You To Trade Stocks, Forex, And Cryptocurrency.
ReplyDelete