scrape item image from aliexpress
This commit is contained in:
parent
ab4ab81e19
commit
2ed92e5412
@ -22,7 +22,7 @@ def check_items(settings_items):
|
|||||||
|
|
||||||
item_regex = re.compile(r'skuAttr\\\":\\\"(([0-9]*:[0-9]*#[a-zA-Z0-9 ]*;?)*)\\\",\\\"skuId\\\":([0-9]*),\\\"skuIdStr\\\":\\\"[0-9]*\\\",\\\"skuPropIds\\\":\\\"[0-9,]*\\\",\\\"skuVal\\\":{\\\"availQuantity\\\":([0-9]*),(\\\"discount\\\":\\\"([0-9]*)\\\",\\\"discountTips\\\":\\\"-[0-9]*%\\\",)?\\\"hideOriPrice\\\":(false|true),\\\"inventory\\\":([0-9]*),\\\"isActivity\\\":(true|false),\\\"optionalWarrantyPrice\\\":\[\],(\\\"skuActivityAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)},\\\"skuActivityAmountLocal\\\":\\\"[0-9]*,[0-9]*.\|[0-9]*\|[0-9]*\\\",)?\\\"skuAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)}')
|
item_regex = re.compile(r'skuAttr\\\":\\\"(([0-9]*:[0-9]*#[a-zA-Z0-9 ]*;?)*)\\\",\\\"skuId\\\":([0-9]*),\\\"skuIdStr\\\":\\\"[0-9]*\\\",\\\"skuPropIds\\\":\\\"[0-9,]*\\\",\\\"skuVal\\\":{\\\"availQuantity\\\":([0-9]*),(\\\"discount\\\":\\\"([0-9]*)\\\",\\\"discountTips\\\":\\\"-[0-9]*%\\\",)?\\\"hideOriPrice\\\":(false|true),\\\"inventory\\\":([0-9]*),\\\"isActivity\\\":(true|false),\\\"optionalWarrantyPrice\\\":\[\],(\\\"skuActivityAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)},\\\"skuActivityAmountLocal\\\":\\\"[0-9]*,[0-9]*.\|[0-9]*\|[0-9]*\\\",)?\\\"skuAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)}')
|
||||||
choice_regex = re.compile(r'businessModel\\\":\\\"CHOICE\\\"')
|
choice_regex = re.compile(r'businessModel\\\":\\\"CHOICE\\\"')
|
||||||
#shipping_cost_regex = re.compile(r'')
|
magnifier_image_regex = re.compile(r'<meta property=\"og:image\" content=\"(https:[0-9a-zA-Z\/\-\_.]*)\"\/>')
|
||||||
|
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
cookies_file_path = './cookies.json'
|
cookies_file_path = './cookies.json'
|
||||||
@ -43,8 +43,18 @@ def check_items(settings_items):
|
|||||||
discount = 0 if len(elem[5]) == 0 else int(elem[5])
|
discount = 0 if len(elem[5]) == 0 else int(elem[5])
|
||||||
price = float(elem[13]) if len(elem[11]) == 0 else float(elem[11])
|
price = float(elem[13]) if len(elem[11]) == 0 else float(elem[11])
|
||||||
|
|
||||||
# skuId, quantity, discount_percentage, price, currency, choice_delivery
|
# get item image
|
||||||
extract[key] = {"skuid": elem[2], "quantity": elem[3], "discount_percentage": discount, "price": price, "currency": elem[12], "choice_delivery": is_choice}
|
image_link = re.findall(magnifier_image_regex, target_page_response.text)[0]
|
||||||
|
for attr in filter_attributes:
|
||||||
|
image_regex = re.compile(fr'\"propertyValueDefinitionName\":\"{attr}\",\"propertyValueIdLong\":[0-9]*,\"skuPropertyImageSummPath\":\"(https:\/\/[0-9a-zA-Z.\/\-\_]*)\"')
|
||||||
|
image = re.findall(image_regex, target_page_response.text)
|
||||||
|
if len(image)>0:
|
||||||
|
image_link = image[0]
|
||||||
|
image_link = re.sub(r'jpg_[0-9]+x[0-9]+', "jpg_800x800", image_link) # get bigger image instead of preview
|
||||||
|
break
|
||||||
|
|
||||||
|
# skuId, quantity, discount_percentage, price, currency, choice_delivery, image
|
||||||
|
extract[key] = {"skuid": elem[2], "quantity": elem[3], "discount_percentage": discount, "price": price, "currency": elem[12], "choice_delivery": is_choice, "image": image_link}
|
||||||
else:
|
else:
|
||||||
print(f'Failed to fetch target page. Status code: {target_page_response.status_code}')
|
print(f'Failed to fetch target page. Status code: {target_page_response.status_code}')
|
||||||
return extract
|
return extract
|
||||||
@ -60,7 +70,7 @@ def get_attributes(attributes_raw):
|
|||||||
|
|
||||||
def fill_db(db_settings, items_dict):
|
def fill_db(db_settings, items_dict):
|
||||||
for key,value in items_dict.items():
|
for key,value in items_dict.items():
|
||||||
add_history_entry(db_settings, key[0], value["skuid"], value["choice_delivery"], list(key[1]), value["price"], value["currency"], value["quantity"], value["discount_percentage"])
|
add_history_entry(db_settings, key[0], value["skuid"], value["choice_delivery"], list(key[1]), value["image"], value["price"], value["currency"], value["quantity"], value["discount_percentage"])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
15
db.py
15
db.py
@ -13,18 +13,18 @@ def connect_db(db_settings):
|
|||||||
print(error)
|
print(error)
|
||||||
return conn
|
return conn
|
||||||
|
|
||||||
def add_item(db_settings, itemid, skuid, choice, attributes):
|
def add_item(db_settings, itemid, skuid, choice, attributes, image):
|
||||||
connection = connect_db(db_settings)
|
connection = connect_db(db_settings)
|
||||||
cursor = connection.cursor()
|
cursor = connection.cursor()
|
||||||
|
|
||||||
cursor.execute("""
|
cursor.execute("""
|
||||||
INSERT INTO item (itemid, skuid, choice, attributes)
|
INSERT INTO item (itemid, skuid, choice, attributes, image)
|
||||||
VALUES (%s, %s, %s, %s)
|
VALUES (%s, %s, %s, %s, %s)
|
||||||
""", (itemid, skuid, choice, attributes))
|
""", (itemid, skuid, choice, attributes, image))
|
||||||
connection.commit()
|
connection.commit()
|
||||||
connection.close()
|
connection.close()
|
||||||
|
|
||||||
def add_history_entry(db_settings, itemid, skuid, choice, attributes, price, currency, quantity, discount_percentage):
|
def add_history_entry(db_settings, itemid, skuid, choice, attributes, image, price, currency, quantity, discount_percentage):
|
||||||
connection = connect_db(db_settings)
|
connection = connect_db(db_settings)
|
||||||
cursor = connection.cursor()
|
cursor = connection.cursor()
|
||||||
|
|
||||||
@ -36,7 +36,7 @@ def add_history_entry(db_settings, itemid, skuid, choice, attributes, price, cur
|
|||||||
""", (itemid, skuid))
|
""", (itemid, skuid))
|
||||||
|
|
||||||
if cursor.rowcount == 0:
|
if cursor.rowcount == 0:
|
||||||
add_item(db_settings, itemid, skuid, choice, attributes)
|
add_item(db_settings, itemid, skuid, choice, attributes, image)
|
||||||
|
|
||||||
cursor.execute("""
|
cursor.execute("""
|
||||||
INSERT INTO history (itemid, skuid, price, currency, quantity, discount_percentage, h_timestamp)
|
INSERT INTO history (itemid, skuid, price, currency, quantity, discount_percentage, h_timestamp)
|
||||||
@ -51,7 +51,7 @@ def export_csv(db_settings):
|
|||||||
cursor = connection.cursor()
|
cursor = connection.cursor()
|
||||||
|
|
||||||
cursor.execute("""
|
cursor.execute("""
|
||||||
SELECT i.itemid, i.skuid, i.choice, i.attributes, h.quantity, h.discount_percentage, h.price, h.currency, h.h_timestamp
|
SELECT i.itemid, i.skuid, i.choice, i.attributes, i.image, h.quantity, h.discount_percentage, h.price, h.currency, h.h_timestamp
|
||||||
FROM item i, history h
|
FROM item i, history h
|
||||||
WHERE i.itemid = h.itemid and i.skuid = h.skuid
|
WHERE i.itemid = h.itemid and i.skuid = h.skuid
|
||||||
""")
|
""")
|
||||||
@ -84,6 +84,7 @@ def initialize(db_settings):
|
|||||||
skuid bigint,
|
skuid bigint,
|
||||||
choice boolean,
|
choice boolean,
|
||||||
attributes text[],
|
attributes text[],
|
||||||
|
image text,
|
||||||
primary key (itemid,skuid)
|
primary key (itemid,skuid)
|
||||||
)
|
)
|
||||||
""")
|
""")
|
||||||
|
@ -22,6 +22,7 @@ d3.csv('http://127.0.0.1/output.csv', d => {
|
|||||||
value: parseFloat(d.price.replace('$', '')),
|
value: parseFloat(d.price.replace('$', '')),
|
||||||
skuid: d.skuid,
|
skuid: d.skuid,
|
||||||
itemid: d.itemid,
|
itemid: d.itemid,
|
||||||
|
image: d.image,
|
||||||
currency: d.currency
|
currency: d.currency
|
||||||
}
|
}
|
||||||
}).then(function(data) {
|
}).then(function(data) {
|
||||||
@ -65,7 +66,7 @@ d3.csv('http://127.0.0.1/output.csv', d => {
|
|||||||
.attr("y", height*0.1)
|
.attr("y", height*0.1)
|
||||||
.attr("width", height*0.8)
|
.attr("width", height*0.8)
|
||||||
.attr("height", height*0.8)
|
.attr("height", height*0.8)
|
||||||
.attr("xlink:href", "https://en.wikipedia.org/static/images/icons/wikipedia.png") // placeholder picture for now, should be item picture
|
.attr("xlink:href", dataSubset[0].image) // placeholder picture for now, should be item picture
|
||||||
.on("click", function() { window.open(link); });
|
.on("click", function() { window.open(link); });
|
||||||
|
|
||||||
// Price domain (height)
|
// Price domain (height)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user