Fix catastrophic regex backtracking + add background job to update the DB

This commit is contained in:
Sam Hadow 2024-01-31 21:56:16 +01:00
parent 3b6e66c886
commit c7a4bc711d
8 changed files with 103 additions and 54 deletions

20
CronDockerfile Normal file
View File

@ -0,0 +1,20 @@
# Image for the background container: it installs the packages listed in
# Cronrequirements.txt and runs background.py.
FROM python:bookworm
# Copy the dependency list to the image root so pip can read it below.
COPY /Cronrequirements.txt /
RUN pip3 install --upgrade pip
RUN pip3 install -r /Cronrequirements.txt
# Only the background script is copied into the image; no other source file is.
COPY ./src/background.py /app/background.py
WORKDIR /app
# Default value of WAIT_TIME. NOTE(review): presumably consumed by background.py
# as "<positive integer><s|m|h|d>" (seconds, minutes, hours, days) -- confirm.
ENV WAIT_TIME="1d"
CMD ["python", "background.py"]

1
Cronrequirements.txt Normal file
View File

@ -0,0 +1 @@
requests

View File

@ -1,8 +1,10 @@
# Build the application image (Dockerfile) and the background-job image (CronDockerfile).
podman build --tag alipricetrack:1.0.0 -f ./Dockerfile
podman build --tag alipricetrack_cron:1.0.0 -f ./CronDockerfile
# Create the pod that groups the three containers; host port 8086 maps to port 8080 in the pod.
podman pod create --name aliexpress -p 8086:8080
# PostgreSQL container; its data directory is bind-mounted from /home/data/podman/aliexpress/db on the host.
# NOTE(review): the database password is hard-coded here -- confirm this is acceptable for the deployment.
podman run -d --pod=aliexpress -e POSTGRES_DB="aliexpress" -e POSTGRES_USER="postgres" -e POSTGRES_PASSWORD="postgres" -v /home/data/podman/aliexpress/db:/var/lib/postgresql/data:Z --name ali-db docker.io/postgres
# Application container, started with the same database name and credentials as ali-db.
podman run -d --pod=aliexpress -e POSTGRES_DB="aliexpress" -e POSTGRES_USER="postgres" -e POSTGRES_PASSWORD="postgres" --name ali-app alipricetrack:1.0.0
# Background-job container; no -e option is passed, so it runs with the WAIT_TIME default set in the image.
podman run -d --pod=aliexpress --name ali-cron alipricetrack_cron:1.0.0

View File

@ -19,7 +19,7 @@ def load_cookies_from_file(file_path):
return cookies_dict
def check_items(settings_items):
def check_item(settings_item):
'''
return a dict with items data extracted from aliexpress.
a file containing aliexpress login token cookies has to be provided in ./cookies.json (obtained with cookie-quick-manager https://github.com/ysard/cookie-quick-manager/) for accurate prices (no "welcome discount")
@ -29,7 +29,7 @@ def check_items(settings_items):
itemid is in aliexpress link to item page. attributes is a list of string. Each string is a choice value (for example which length, or which colour) if multiple items are on the same page, only one by category, order doesn't matter.
'''
item_regex = re.compile(r'skuAttr\\\":\\\"([0-9:;]*#([a-zA-Z0-9 ]*;?)*)?\\\",\\\"skuId\\\":([0-9]*),\\\"skuIdStr\\\":\\\"[0-9]*\\\",\\\"skuPropIds\\\":\\\"[0-9,]*\\\",\\\"skuVal\\\":{\\\"availQuantity\\\":([0-9]*),(\\\"bulkOrder\\\":([0-9]*),)?(\\\"discount\\\":\\\"([0-9]*)\\\",\\\"discountTips\\\":\\\"-[0-9]*%\\\",)?\\\"hideOriPrice\\\":(false|true),\\\"inventory\\\":([0-9]*),\\\"isActivity\\\":(true|false),\\\"optionalWarrantyPrice\\\":\[\],(\\\"skuActivityAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)},\\\"skuActivityAmountLocal\\\":\\\"[0-9]*,[0-9]*.\|[0-9]*\|[0-9]*\\\",)?\\\"skuAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)}')
item_regex = re.compile(r'skuAttr\\\":\\\"(([0-9]*:[0-9;:]*#[a-zA-Z0-9 \.]*;?)*)?\\\",\\\"skuId\\\":([0-9]*),\\\"skuIdStr\\\":\\\"[0-9]*\\\",\\\"skuPropIds\\\":\\\"[0-9,]*\\\",\\\"skuVal\\\":{\\\"availQuantity\\\":([0-9]*),(\\\"bulkOrder\\\":([0-9]*),)?(\\\"discount\\\":\\\"([0-9]*)\\\",\\\"discountTips\\\":\\\"-[0-9]*%\\\",)?\\\"hideOriPrice\\\":(false|true),\\\"inventory\\\":([0-9]*),\\\"isActivity\\\":(true|false),\\\"optionalWarrantyPrice\\\":\[\],(\\\"skuActivityAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)},\\\"skuActivityAmountLocal\\\":\\\"[0-9]*,[0-9]*.\|[0-9]*\|[0-9]*\\\",)?\\\"skuAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)}')
choice_regex = re.compile(r'businessModel\\\":\\\"CHOICE\\\"')
magnifier_image_regex = re.compile(r'<meta property=\"og:image\" content=\"(https:[0-9a-zA-Z\/\-\_.]*)\"\/>')
@ -40,38 +40,42 @@ def check_items(settings_items):
extract = dict()
for (item,filter_attributes) in settings_items:
url = 'https://aliexpress.com/item/'+item+'.html'
target_page_response = session.get(url)
if target_page_response.status_code == 200:
content = re.findall(item_regex, target_page_response.text)
is_choice = bool(re.search(choice_regex, target_page_response.text))
for elem in content:
if set(get_attributes(elem[0])) == set(filter_attributes):
key = (item,tuple(filter_attributes))
discount = 0 if len(elem[7]) == 0 else int(elem[7])
price = float(elem[15]) if len(elem[13]) == 0 else float(elem[13])
print(settings_item)
item = settings_item[0]
filter_attributes = settings_item[1]
# get item image
image_link = re.findall(magnifier_image_regex, target_page_response.text)[0]
for attr in filter_attributes:
image_regex = re.compile(fr'\"propertyValueDefinitionName\":\"{attr}\",\"propertyValueIdLong\":[0-9]*,\"skuPropertyImageSummPath\":\"(https:\/\/[0-9a-zA-Z.\/\-\_]*)\"')
image = re.findall(image_regex, target_page_response.text)
if len(image)>0:
image_link = image[0]
image_link = re.sub(r'jpg_[0-9]+x[0-9]+', "jpg_800x800", image_link) # get bigger image instead of preview
break
url = 'https://aliexpress.com/item/'+str(item)+'.html'
target_page_response = session.get(url)
if target_page_response.status_code == 200:
content = re.findall(item_regex, target_page_response.text)
is_choice = bool(re.search(choice_regex, target_page_response.text))
for elem in content:
if set(get_attributes(elem[0])) == set(filter_attributes):
key = (item,tuple(filter_attributes))
discount = 0 if len(elem[7]) == 0 else int(elem[7])
price = float(elem[15]) if len(elem[13]) == 0 else float(elem[13])
# get item image
image_link = re.findall(magnifier_image_regex, target_page_response.text)[0]
for attr in filter_attributes:
image_regex = re.compile(fr'\"propertyValueDefinitionName\":\"{attr}\",\"propertyValueIdLong\":[0-9]*,\"skuPropertyImageSummPath\":\"(https:\/\/[0-9a-zA-Z.\/\-\_]*)\"')
image = re.findall(image_regex, target_page_response.text)
if len(image)>0:
image_link = image[0]
image_link = re.sub(r'jpg_[0-9]+x[0-9]+', "jpg_800x800", image_link) # get bigger image instead of preview
break
# skuId, quantity, discount_percentage, price, currency, choice_delivery, image
extract[key] = {"skuid": elem[2], "quantity": elem[3], "discount_percentage": discount, "price": price, "currency": elem[12], "choice_delivery": is_choice, "image": image_link}
else:
print(f'Failed to fetch target page. Status code: {target_page_response.status_code}')
# skuId, quantity, discount_percentage, price, currency, choice_delivery, image
extract[key] = {"skuid": elem[2], "quantity": elem[3], "discount_percentage": discount, "price": price, "currency": elem[12], "choice_delivery": is_choice, "image": image_link}
else:
print(f'Failed to fetch target page. Status code: {target_page_response.status_code}')
return extract
def get_attributes(attributes_raw):
'''return a list of attributes from attributes raw string'''
# id_regex = re.compile(r'([0-9]*)=')
attr_regex = re.compile(r'#([0-9a-zA-Z ]*)')
attr_regex = re.compile(r'#([0-9a-zA-Z \.]*)')
# item_id = re.search(id_regex, attributes_raw).group(1)
attributes = re.findall(attr_regex, attributes_raw)
@ -86,8 +90,9 @@ def fill_db(items_dict):
def update_items():
'''add new history entries for items in database'''
in_db = get_item_keys()
new_entries = check_items(in_db)
fill_db(new_entries)
for item in in_db:
new_entry = check_item(item)
fill_db(new_entry)

View File

@ -16,8 +16,8 @@ def init_db():
@app.route('/update')
def update_hist():
print("update")
fill_db(check_items())
return 'Hello, World!'
update_items()
return 'items updated'
@app.route('/add', methods=['POST'])
def add_item():

View File

@ -1,5 +1,28 @@
#!/usr/bin/python
from aliexpress import *
import requests, re, time, os
def update():
    '''Send a GET request to the local /update endpoint and print the response object.'''
    print(requests.get("http://127.0.0.1:8080/update"))
def parse_wait_time(formatted_time):
    '''Convert a WAIT_TIME string to a number of seconds.

    The accepted format is a positive integer without leading zero,
    immediately followed by one unit letter: s (seconds), m (minutes),
    h (hours) or d (days), e.g. "30s", "15m", "12h", "1d".
    Surrounding whitespace is ignored.

    Returns the duration in seconds as a float, or None when the value is
    missing (None or empty) or does not match the format.
    '''
    if not formatted_time:
        # Covers an unset environment variable (None) and an empty string.
        return None
    units = {
        's': 1,
        'm': 60,
        'h': 3600,
        'd': 86400
    }
    # fullmatch: the whole value must be valid, otherwise inputs such as
    # "1.5h" would be silently read as "5h" and "10m30s" as "10m".
    match = re.fullmatch(r'([1-9][0-9]*)([smhd])', formatted_time.strip())
    if match is None:
        return None
    return float(match.group(1)) * units[match.group(2)]


if __name__ == '__main__':
    print("ok")
    raw_time = parse_wait_time(os.environ.get('WAIT_TIME'))
    if raw_time is not None:
        while True:
            try:
                update()
            except Exception:
                # Keep the loop alive on any update error, but let
                # KeyboardInterrupt / SystemExit stop the process.
                print("update failed")
            time.sleep(raw_time)
    else:
        print("WAIT_TIME incorrect")

View File

@ -45,12 +45,13 @@ def add_history_entry(itemid, skuid, choice, attributes, image, price, currency,
if not check_exist(itemid, skuid):
add_item(itemid, skuid, choice, attributes, image)
cursor.execute("""
SELECT uuid
FROM item
WHERE itemid = %s
AND skuid = %s
""", (itemid, skuid))
cursor.execute("""
SELECT uuid
FROM item
WHERE itemid = %s
AND skuid = %s
""", (itemid, skuid))
uuid = cursor.fetchall()[0]

View File

@ -3,27 +3,24 @@ import requests, re, json, os, yaml
from db import *
from aliexpress import *
def get_conf():
    '''Load settings.yaml from the directory of this script and return its parsed content.'''
    script_dir = os.path.dirname(os.path.realpath(__file__))
    conf_path = script_dir + "/settings.yaml"
    with open(conf_path, 'r') as handle:
        return yaml.safe_load(handle)
if __name__ == '__main__':
settings = get_conf()
c_l = [("33058732737",["3 M Probe"])]
print(check_items(c_l))
c_l = [("1005005769229528", ["2 E27 Clip EU"])]
print(check_items(c_l))
c_l = [("1005004130931033", [])]
print(check_items(c_l))
c_l = ["33058732737", ["1.2 M Probe"]]
print(check_item(c_l))
c_l = ["1005005769229528", ["2 E27 Clip EU"]]
print(check_item(c_l))
c_l = ["1005004130931033", []]
print(check_item(c_l))
print("########")
c_l = ["1005006030884318", ["Natural White", "7W", "E27"]]
print(check_item(c_l))
# print(get_item_keys())
# initialize(settings["db"])
# fill_db(settings["db"], check_items(settings["item"]))
# fill_db(settings["db"], check_item(settings["item"]))
# export_csv(settings["db"])