From c7a4bc711dbe6e85d2b0a5f03e9adfb34705c0bd Mon Sep 17 00:00:00 2001 From: Sam Hadow Date: Wed, 31 Jan 2024 21:56:16 +0100 Subject: [PATCH] regex catastrophic backtracking fix + background job update db --- CronDockerfile | 20 ++++++++++++++ Cronrequirements.txt | 1 + podman-commands | 4 ++- src/aliexpress.py | 63 ++++++++++++++++++++++++-------------------- src/app.py | 4 +-- src/background.py | 27 +++++++++++++++++-- src/db.py | 13 ++++----- src/main.py | 25 ++++++++---------- 8 files changed, 103 insertions(+), 54 deletions(-) create mode 100644 CronDockerfile create mode 100644 Cronrequirements.txt diff --git a/CronDockerfile b/CronDockerfile new file mode 100644 index 0000000..a37a5d0 --- /dev/null +++ b/CronDockerfile @@ -0,0 +1,20 @@ +FROM python:bookworm + +COPY /Cronrequirements.txt / + +RUN pip3 install --upgrade pip + +RUN pip3 install -r /Cronrequirements.txt + + + +COPY ./src/background.py /app/background.py + +WORKDIR /app + + + + +ENV WAIT_TIME="1d" + +CMD ["python", "background.py"] diff --git a/Cronrequirements.txt b/Cronrequirements.txt new file mode 100644 index 0000000..f229360 --- /dev/null +++ b/Cronrequirements.txt @@ -0,0 +1 @@ +requests diff --git a/podman-commands b/podman-commands index 98f7e5d..0991788 100644 --- a/podman-commands +++ b/podman-commands @@ -1,8 +1,10 @@ podman build --tag alipricetrack:1.0.0 -f ./Dockerfile - +podman build --tag alipricetrack_cron:1.0.0 -f ./CronDockerfile podman pod create --name aliexpress -p 8086:8080 podman run -d --pod=aliexpress -e POSTGRES_DB="aliexpress" -e POSTGRES_USER="postgres" -e POSTGRES_PASSWORD="postgres" -v /home/data/podman/aliexpress/db:/var/lib/postgresql/data:Z --name ali-db docker.io/postgres podman run -d --pod=aliexpress -e POSTGRES_DB="aliexpress" -e POSTGRES_USER="postgres" -e POSTGRES_PASSWORD="postgres" --name ali-app alipricetrack:1.0.0 + +podman run -d --pod=aliexpress --name ali-cron alipricetrack_cron:1.0.0 diff --git a/src/aliexpress.py b/src/aliexpress.py index 6774ca8..c30fb0a 100644 --- a/src/aliexpress.py +++ b/src/aliexpress.py @@ -19,7 +19,7 @@ def load_cookies_from_file(file_path): return cookies_dict -def check_items(settings_items): +def check_item(settings_item): ''' return a dict with items data extracted from aliexpress. a file containing aliexpress login token cookies has to be provided in ./cookies.json (obtained with cookie-quick-manager https://github.com/ysard/cookie-quick-manager/) for accurate prices (no "welcome discount") @@ -29,7 +29,7 @@ def check_items(settings_items): itemid is in aliexpress link to item page. attributes is a list of string. Each string is a choice value (for example which length, or which colour) if multiple items are on the same page, only one by category, order doesn't matter. ''' - item_regex = re.compile(r'skuAttr\\\":\\\"([0-9:;]*#([a-zA-Z0-9 ]*;?)*)?\\\",\\\"skuId\\\":([0-9]*),\\\"skuIdStr\\\":\\\"[0-9]*\\\",\\\"skuPropIds\\\":\\\"[0-9,]*\\\",\\\"skuVal\\\":{\\\"availQuantity\\\":([0-9]*),(\\\"bulkOrder\\\":([0-9]*),)?(\\\"discount\\\":\\\"([0-9]*)\\\",\\\"discountTips\\\":\\\"-[0-9]*%\\\",)?\\\"hideOriPrice\\\":(false|true),\\\"inventory\\\":([0-9]*),\\\"isActivity\\\":(true|false),\\\"optionalWarrantyPrice\\\":\[\],(\\\"skuActivityAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)},\\\"skuActivityAmountLocal\\\":\\\"[0-9]*,[0-9]*.\|[0-9]*\|[0-9]*\\\",)?\\\"skuAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)}') + item_regex = re.compile(r'skuAttr\\\":\\\"(([0-9]*:[0-9;:]*#[a-zA-Z0-9 \.]*;?)*)?\\\",\\\"skuId\\\":([0-9]*),\\\"skuIdStr\\\":\\\"[0-9]*\\\",\\\"skuPropIds\\\":\\\"[0-9,]*\\\",\\\"skuVal\\\":{\\\"availQuantity\\\":([0-9]*),(\\\"bulkOrder\\\":([0-9]*),)?(\\\"discount\\\":\\\"([0-9]*)\\\",\\\"discountTips\\\":\\\"-[0-9]*%\\\",)?\\\"hideOriPrice\\\":(false|true),\\\"inventory\\\":([0-9]*),\\\"isActivity\\\":(true|false),\\\"optionalWarrantyPrice\\\":\[\],(\\\"skuActivityAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)},\\\"skuActivityAmountLocal\\\":\\\"[0-9]*,[0-9]*.\|[0-9]*\|[0-9]*\\\",)?\\\"skuAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)}') choice_regex = re.compile(r'businessModel\\\":\\\"CHOICE\\\"') magnifier_image_regex = re.compile(r'') @@ -40,38 +40,42 @@ def check_items(settings_items): extract = dict() - for (item,filter_attributes) in settings_items: - url = 'https://aliexpress.com/item/'+item+'.html' - target_page_response = session.get(url) - if target_page_response.status_code == 200: - content = re.findall(item_regex, target_page_response.text) - is_choice = bool(re.search(choice_regex, target_page_response.text)) - for elem in content: - if set(get_attributes(elem[0])) == set(filter_attributes): - key = (item,tuple(filter_attributes)) - discount = 0 if len(elem[7]) == 0 else int(elem[7]) - price = float(elem[15]) if len(elem[13]) == 0 else float(elem[13]) + print(settings_item) + item = settings_item[0] + filter_attributes = settings_item[1] - # get item image - image_link = re.findall(magnifier_image_regex, target_page_response.text)[0] - for attr in filter_attributes: - image_regex = re.compile(fr'\"propertyValueDefinitionName\":\"{attr}\",\"propertyValueIdLong\":[0-9]*,\"skuPropertyImageSummPath\":\"(https:\/\/[0-9a-zA-Z.\/\-\_]*)\"') - image = re.findall(image_regex, target_page_response.text) - if len(image)>0: - image_link = image[0] - image_link = re.sub(r'jpg_[0-9]+x[0-9]+', "jpg_800x800", image_link) # get bigger image instead of preview - break + url = 'https://aliexpress.com/item/'+str(item)+'.html' + target_page_response = session.get(url) + if target_page_response.status_code == 200: + content = re.findall(item_regex, target_page_response.text) + is_choice = bool(re.search(choice_regex, target_page_response.text)) + for elem in content: + if set(get_attributes(elem[0])) == set(filter_attributes): + key = (item,tuple(filter_attributes)) + discount = 0 if len(elem[7]) == 0 else int(elem[7]) + price = float(elem[15]) if len(elem[13]) == 0 else float(elem[13]) + + # get item image + image_link = re.findall(magnifier_image_regex, target_page_response.text)[0] + for attr in filter_attributes: + image_regex = re.compile(fr'\"propertyValueDefinitionName\":\"{attr}\",\"propertyValueIdLong\":[0-9]*,\"skuPropertyImageSummPath\":\"(https:\/\/[0-9a-zA-Z.\/\-\_]*)\"') + image = re.findall(image_regex, target_page_response.text) + if len(image)>0: + image_link = image[0] + image_link = re.sub(r'jpg_[0-9]+x[0-9]+', "jpg_800x800", image_link) # get bigger image instead of preview + break + + # skuId, quantity, discount_percentage, price, currency, choice_delivery, image + extract[key] = {"skuid": elem[2], "quantity": elem[3], "discount_percentage": discount, "price": price, "currency": elem[12], "choice_delivery": is_choice, "image": image_link} + else: + print(f'Failed to fetch target page. Status code: {target_page_response.status_code}') - # skuId, quantity, discount_percentage, price, currency, choice_delivery, image - extract[key] = {"skuid": elem[2], "quantity": elem[3], "discount_percentage": discount, "price": price, "currency": elem[12], "choice_delivery": is_choice, "image": image_link} - else: - print(f'Failed to fetch target page. Status code: {target_page_response.status_code}') return extract def get_attributes(attributes_raw): '''return a list of attributes from attributes raw string''' # id_regex = re.compile(r'([0-9]*)=') - attr_regex = re.compile(r'#([0-9a-zA-Z ]*)') + attr_regex = re.compile(r'#([0-9a-zA-Z \.]*)') # item_id = re.search(id_regex, attributes_raw).group(1) attributes = re.findall(attr_regex, attributes_raw) @@ -86,8 +90,9 @@ def fill_db(items_dict): def update_items(): '''add new history entries for items in database''' in_db = get_item_keys() - new_entries = check_items(in_db) - fill_db(new_entries) + for item in in_db: + new_entry = check_item(item) + fill_db(new_entry) diff --git a/src/app.py b/src/app.py index e9c6240..8ef0384 100644 --- a/src/app.py +++ b/src/app.py @@ -16,8 +16,8 @@ def init_db(): @app.route('/update') def update_hist(): print("update") - fill_db(check_items()) - return 'Hello, World!' + update_items() + return 'items updated' @app.route('/add', methods=['POST']) def add_item(): diff --git a/src/background.py b/src/background.py index 12650a2..2fe06f3 100644 --- a/src/background.py +++ b/src/background.py @@ -1,5 +1,28 @@ #!/usr/bin/python -from aliexpress import * +import requests, re, time, os + +def update(): + url = "http://127.0.0.1:8080/update" + response = requests.get(url) + print(response) if __name__ == '__main__': - print("ok") + regex_time = re.compile(r'([1-9][0-9]*)([smhd])') + formatted_time = os.environ.get('WAIT_TIME') + units = { + 's':1, + 'm':60, + 'h':3600, + 'd':86400 + } + match = re.search(regex_time, formatted_time) + if bool(match): + raw_time = float(match.group(1))*units[match.group(2)] + while True: + try: + update() + except: + print("update failed") + time.sleep(raw_time) + else: + print("WAIT_TIME incorrect") diff --git a/src/db.py b/src/db.py index d73a8d5..e26f80e 100644 --- a/src/db.py +++ b/src/db.py @@ -45,12 +45,13 @@ def add_history_entry(itemid, skuid, choice, attributes, image, price, currency, if not check_exist(itemid, skuid): add_item(itemid, skuid, choice, attributes, image) - cursor.execute(""" - SELECT uuid - FROM item - WHERE itemid = %s - AND skuid = %s - """, (itemid, skuid)) + + cursor.execute(""" + SELECT uuid + FROM item + WHERE itemid = %s + AND skuid = %s + """, (itemid, skuid)) uuid = cursor.fetchall()[0] diff --git a/src/main.py b/src/main.py index 998c085..1a1755b 100644 --- a/src/main.py +++ b/src/main.py @@ -3,27 +3,24 @@ import requests, re, json, os, yaml from db import * from aliexpress import * -def get_conf(): - '''return settings in settings.yaml file''' - with open(os.path.dirname(os.path.realpath(__file__))+"/settings.yaml", 'r') as conf_file: - settings = yaml.safe_load(conf_file) - return settings if __name__ == '__main__': - settings = get_conf() - - c_l = [("33058732737",["3 M Probe"])] - print(check_items(c_l)) - c_l = [("1005005769229528", ["2 E27 Clip EU"])] - print(check_items(c_l)) - c_l = [("1005004130931033", [])] - print(check_items(c_l)) + c_l = ["33058732737", ["1.2 M Probe"]] + print(check_item(c_l)) + c_l = ["1005005769229528", ["2 E27 Clip EU"]] + print(check_item(c_l)) + c_l = ["1005004130931033", []] + print(check_item(c_l)) + print("########") + c_l = ["1005006030884318", ["Natural White", "7W", "E27"]] + print(check_item(c_l)) + # print(get_item_keys()) # initialize(settings["db"]) - # fill_db(settings["db"], check_items(settings["item"])) + # fill_db(settings["db"], check_item(settings["item"])) # export_csv(settings["db"])