2023-12-28 13:38:21 +01:00
|
|
|
#!/usr/bin/python
|
|
|
|
import requests, re, json, os, yaml
|
2023-12-28 17:39:36 +01:00
|
|
|
from db import *
|
2023-12-28 13:38:21 +01:00
|
|
|
|
|
|
|
def load_cookies_from_file(file_path):
    '''
    Read a JSON cookie export and return a dict usable in a request session.

    The file must contain a JSON list of objects. For each object the
    "Name raw" entry becomes the cookie name and the "Content raw" entry
    (converted to text) becomes the cookie value; a missing entry counts as
    an empty string. Cookies whose value is empty are left out, and when a
    name appears several times the last occurrence wins.
    '''
    with open(file_path, 'r') as handle:
        entries = json.load(handle)

    # Lazily pair every cookie name with its value rendered as text.
    pairs = (
        (entry.get("Name raw", ""), str(entry.get("Content raw", "")))
        for entry in entries
    )

    # An empty value carries no information for the session: drop it.
    return {name: value for name, value in pairs if value}
|
|
|
|
|
2023-12-28 17:39:36 +01:00
|
|
|
def _find_item_image(page_text, attributes):
    '''
    Return the URL of the image to store for an item page, or "" if none is found.

    The first attribute that has its own image in the page wins; its preview
    size suffix is rewritten to request the 800x800 version. Otherwise the
    page-level og:image meta tag is used as is.
    '''
    for attr in attributes:
        # The attribute is user-provided text: escape it so characters such
        # as "+" or "(" are matched literally instead of as regex syntax.
        escaped_attr = re.escape(attr)
        image_regex = re.compile(fr'\"propertyValueDefinitionName\":\"{escaped_attr}\",\"propertyValueIdLong\":[0-9]*,\"skuPropertyImageSummPath\":\"(https:\/\/[0-9a-zA-Z.\/\-\_]*)\"')
        found = image_regex.search(page_text)
        if found:
            # get bigger image instead of preview
            return re.sub(r'jpg_[0-9]+x[0-9]+', "jpg_800x800", found.group(1))

    magnifier_image_regex = re.compile(r'<meta property=\"og:image\" content=\"(https:[0-9a-zA-Z\/\-\_.]*)\"\/>')
    fallback = magnifier_image_regex.search(page_text)
    # NOTE(review): a page without any image used to raise IndexError; it now
    # yields an empty link - confirm add_history_entry accepts an empty string.
    return fallback.group(1) if fallback else ""


def check_items(settings_items):
    '''
    Return a dict with item data extracted from AliExpress item pages.

    A file containing AliExpress login token cookies has to be provided in
    cookies.json next to this script (obtained with cookie-quick-manager
    https://github.com/ysard/cookie-quick-manager/) for accurate prices
    (no "welcome discount").

    Parameter settings_items is a list of pairs (itemid, attributes).
    itemid is the string id found in the AliExpress link to the item page.
    attributes is a list of strings, each one a choice value (for example
    which length, or which colour) when several variants are sold on the
    same page; only one value per category, order doesn't matter.

    The result maps (itemid, tuple(attributes)) to a dict with the keys
    skuid, quantity, discount_percentage, price, currency, choice_delivery
    and image. skuid and quantity are the strings captured from the page,
    discount_percentage is an int and price is a float. Items whose page
    does not answer with status 200, or for which no variant matches the
    requested attributes, are absent from the result.
    '''
    # Indexes of the captured groups used below (as returned by findall):
    #   0 raw sku attributes, 2 skuId, 3 availQuantity, 7 discount,
    #   12/13 currency/value of the optional skuActivityAmount,
    #   14/15 currency/value of skuAmount.
    item_regex = re.compile(r'skuAttr\\\":\\\"([0-9:;]*#([a-zA-Z0-9 ]*;?)*)\\\",\\\"skuId\\\":([0-9]*),\\\"skuIdStr\\\":\\\"[0-9]*\\\",\\\"skuPropIds\\\":\\\"[0-9,]*\\\",\\\"skuVal\\\":{\\\"availQuantity\\\":([0-9]*),(\\\"(bulkOrder)\\\":[0-9]*,)?(\\\"discount\\\":\\\"([0-9]*)\\\",\\\"discountTips\\\":\\\"-[0-9]*%\\\",)?\\\"hideOriPrice\\\":(false|true),\\\"inventory\\\":([0-9]*),\\\"isActivity\\\":(true|false),\\\"optionalWarrantyPrice\\\":\[\],(\\\"skuActivityAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)},\\\"skuActivityAmountLocal\\\":\\\"[0-9]*,[0-9]*.\|[0-9]*\|[0-9]*\\\",)?\\\"skuAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)}')
    choice_regex = re.compile(r'businessModel\\\":\\\"CHOICE\\\"')

    session = requests.Session()
    cookies_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'cookies.json')
    cookies = load_cookies_from_file(cookies_file_path)
    session.cookies.update(cookies)

    extract = dict()

    for (item, filter_attributes) in settings_items:
        url = 'https://aliexpress.com/item/'+item+'.html'
        target_page_response = session.get(url)
        if target_page_response.status_code != 200:
            print(f'Failed to fetch target page. Status code: {target_page_response.status_code}')
            continue

        # Read the body once; every regex below scans the same text.
        page_text = target_page_response.text
        is_choice = bool(choice_regex.search(page_text))
        wanted_attributes = set(filter_attributes)
        key = (item, tuple(filter_attributes))

        for elem in item_regex.findall(page_text):
            # Keep only the variant whose attributes are exactly the requested ones.
            if set(get_attributes(elem[0])) != wanted_attributes:
                continue

            discount = int(elem[7]) if elem[7] else 0

            # The activity (promotional) amount is optional. Take price and
            # currency from the same amount so the currency is not left
            # empty when the regular skuAmount is used.
            if elem[13]:
                price = float(elem[13])
                currency = elem[12]
            else:
                price = float(elem[15])
                currency = elem[14]

            image_link = _find_item_image(page_text, filter_attributes)

            # skuId, quantity, discount_percentage, price, currency, choice_delivery, image
            extract[key] = {"skuid": elem[2], "quantity": elem[3], "discount_percentage": discount, "price": price, "currency": currency, "choice_delivery": is_choice, "image": image_link}

    return extract
|
|
|
|
|
|
|
|
def get_attributes(attributes_raw):
    '''
    Extract the attribute values from a raw attributes string.

    Every run of letters, digits and spaces that directly follows a "#" is
    one attribute (the run may be empty). The values are returned as a list,
    in the order they appear in attributes_raw.
    '''
    return re.compile(r'#([0-9a-zA-Z ]*)').findall(attributes_raw)
|
|
|
|
|
|
|
|
def fill_db(db_settings, items_dict):
    '''
    Add one history entry to the database for every item of items_dict.

    items_dict maps a key, whose first element is the item id and whose
    second element is the collection of attributes, to a dict holding the
    extracted fields (skuid, choice_delivery, image, price, currency,
    quantity, discount_percentage). Each pair is forwarded to
    add_history_entry together with db_settings.
    '''
    for item_key, details in items_dict.items():
        add_history_entry(
            db_settings,
            item_key[0],
            details["skuid"],
            details["choice_delivery"],
            list(item_key[1]),
            details["image"],
            details["price"],
            details["currency"],
            details["quantity"],
            details["discount_percentage"],
        )
|
2023-12-28 17:39:36 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|