Fix catastrophic regex backtracking + add background job to update the database

This commit is contained in:
Sam Hadow 2024-01-31 21:56:16 +01:00
parent 3b6e66c886
commit c7a4bc711d
8 changed files with 103 additions and 54 deletions

20
CronDockerfile Normal file
View File

@ -0,0 +1,20 @@
# Image for the background "cron" container: it runs src/background.py, which
# periodically sends a request to the web app's /update endpoint.
FROM python:bookworm

# Make print() output appear immediately in the container logs: when stdout is
# not a terminal Python block-buffers it, and background.py sleeps for a long
# time between prints, so messages would otherwise stay stuck in the buffer.
ENV PYTHONUNBUFFERED=1

# Install the Python dependencies before copying the script so this layer is
# cached independently of changes to background.py. --no-cache-dir keeps pip's
# download cache out of the image.
COPY /Cronrequirements.txt /
RUN pip3 install --no-cache-dir --upgrade pip
RUN pip3 install --no-cache-dir -r /Cronrequirements.txt

COPY ./src/background.py /app/background.py
WORKDIR /app

# Delay between two updates: a positive integer followed by s, m, h or d.
ENV WAIT_TIME="1d"
CMD ["python", "background.py"]

1
Cronrequirements.txt Normal file
View File

@ -0,0 +1 @@
requests

View File

@ -1,8 +1,10 @@
podman build --tag alipricetrack:1.0.0 -f ./Dockerfile podman build --tag alipricetrack:1.0.0 -f ./Dockerfile
podman build --tag alipricetrack_cron:1.0.0 -f ./CronDockerfile
podman pod create --name aliexpress -p 8086:8080 podman pod create --name aliexpress -p 8086:8080
podman run -d --pod=aliexpress -e POSTGRES_DB="aliexpress" -e POSTGRES_USER="postgres" -e POSTGRES_PASSWORD="postgres" -v /home/data/podman/aliexpress/db:/var/lib/postgresql/data:Z --name ali-db docker.io/postgres podman run -d --pod=aliexpress -e POSTGRES_DB="aliexpress" -e POSTGRES_USER="postgres" -e POSTGRES_PASSWORD="postgres" -v /home/data/podman/aliexpress/db:/var/lib/postgresql/data:Z --name ali-db docker.io/postgres
podman run -d --pod=aliexpress -e POSTGRES_DB="aliexpress" -e POSTGRES_USER="postgres" -e POSTGRES_PASSWORD="postgres" --name ali-app alipricetrack:1.0.0 podman run -d --pod=aliexpress -e POSTGRES_DB="aliexpress" -e POSTGRES_USER="postgres" -e POSTGRES_PASSWORD="postgres" --name ali-app alipricetrack:1.0.0
podman run -d --pod=aliexpress --name ali-cron alipricetrack_cron:1.0.0

View File

@ -19,7 +19,7 @@ def load_cookies_from_file(file_path):
return cookies_dict return cookies_dict
def check_items(settings_items): def check_item(settings_item):
''' '''
return a dict with items data extracted from aliexpress. return a dict with items data extracted from aliexpress.
a file containing aliexpress login token cookies has to be provided in ./cookies.json (obtained with cookie-quick-manager https://github.com/ysard/cookie-quick-manager/) for accurate prices (no "welcome discount") a file containing aliexpress login token cookies has to be provided in ./cookies.json (obtained with cookie-quick-manager https://github.com/ysard/cookie-quick-manager/) for accurate prices (no "welcome discount")
@ -29,7 +29,7 @@ def check_items(settings_items):
itemid is in aliexpress link to item page. attributes is a list of string. Each string is a choice value (for example which length, or which colour) if multiple items are on the same page, only one by category, order doesn't matter. itemid is in aliexpress link to item page. attributes is a list of string. Each string is a choice value (for example which length, or which colour) if multiple items are on the same page, only one by category, order doesn't matter.
''' '''
item_regex = re.compile(r'skuAttr\\\":\\\"([0-9:;]*#([a-zA-Z0-9 ]*;?)*)?\\\",\\\"skuId\\\":([0-9]*),\\\"skuIdStr\\\":\\\"[0-9]*\\\",\\\"skuPropIds\\\":\\\"[0-9,]*\\\",\\\"skuVal\\\":{\\\"availQuantity\\\":([0-9]*),(\\\"bulkOrder\\\":([0-9]*),)?(\\\"discount\\\":\\\"([0-9]*)\\\",\\\"discountTips\\\":\\\"-[0-9]*%\\\",)?\\\"hideOriPrice\\\":(false|true),\\\"inventory\\\":([0-9]*),\\\"isActivity\\\":(true|false),\\\"optionalWarrantyPrice\\\":\[\],(\\\"skuActivityAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)},\\\"skuActivityAmountLocal\\\":\\\"[0-9]*,[0-9]*.\|[0-9]*\|[0-9]*\\\",)?\\\"skuAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)}') item_regex = re.compile(r'skuAttr\\\":\\\"(([0-9]*:[0-9;:]*#[a-zA-Z0-9 \.]*;?)*)?\\\",\\\"skuId\\\":([0-9]*),\\\"skuIdStr\\\":\\\"[0-9]*\\\",\\\"skuPropIds\\\":\\\"[0-9,]*\\\",\\\"skuVal\\\":{\\\"availQuantity\\\":([0-9]*),(\\\"bulkOrder\\\":([0-9]*),)?(\\\"discount\\\":\\\"([0-9]*)\\\",\\\"discountTips\\\":\\\"-[0-9]*%\\\",)?\\\"hideOriPrice\\\":(false|true),\\\"inventory\\\":([0-9]*),\\\"isActivity\\\":(true|false),\\\"optionalWarrantyPrice\\\":\[\],(\\\"skuActivityAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)},\\\"skuActivityAmountLocal\\\":\\\"[0-9]*,[0-9]*.\|[0-9]*\|[0-9]*\\\",)?\\\"skuAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)}')
choice_regex = re.compile(r'businessModel\\\":\\\"CHOICE\\\"') choice_regex = re.compile(r'businessModel\\\":\\\"CHOICE\\\"')
magnifier_image_regex = re.compile(r'<meta property=\"og:image\" content=\"(https:[0-9a-zA-Z\/\-\_.]*)\"\/>') magnifier_image_regex = re.compile(r'<meta property=\"og:image\" content=\"(https:[0-9a-zA-Z\/\-\_.]*)\"\/>')
@ -40,38 +40,42 @@ def check_items(settings_items):
extract = dict() extract = dict()
for (item,filter_attributes) in settings_items: print(settings_item)
url = 'https://aliexpress.com/item/'+item+'.html' item = settings_item[0]
target_page_response = session.get(url) filter_attributes = settings_item[1]
if target_page_response.status_code == 200:
content = re.findall(item_regex, target_page_response.text)
is_choice = bool(re.search(choice_regex, target_page_response.text))
for elem in content:
if set(get_attributes(elem[0])) == set(filter_attributes):
key = (item,tuple(filter_attributes))
discount = 0 if len(elem[7]) == 0 else int(elem[7])
price = float(elem[15]) if len(elem[13]) == 0 else float(elem[13])
# get item image url = 'https://aliexpress.com/item/'+str(item)+'.html'
image_link = re.findall(magnifier_image_regex, target_page_response.text)[0] target_page_response = session.get(url)
for attr in filter_attributes: if target_page_response.status_code == 200:
image_regex = re.compile(fr'\"propertyValueDefinitionName\":\"{attr}\",\"propertyValueIdLong\":[0-9]*,\"skuPropertyImageSummPath\":\"(https:\/\/[0-9a-zA-Z.\/\-\_]*)\"') content = re.findall(item_regex, target_page_response.text)
image = re.findall(image_regex, target_page_response.text) is_choice = bool(re.search(choice_regex, target_page_response.text))
if len(image)>0: for elem in content:
image_link = image[0] if set(get_attributes(elem[0])) == set(filter_attributes):
image_link = re.sub(r'jpg_[0-9]+x[0-9]+', "jpg_800x800", image_link) # get bigger image instead of preview key = (item,tuple(filter_attributes))
break discount = 0 if len(elem[7]) == 0 else int(elem[7])
price = float(elem[15]) if len(elem[13]) == 0 else float(elem[13])
# get item image
image_link = re.findall(magnifier_image_regex, target_page_response.text)[0]
for attr in filter_attributes:
image_regex = re.compile(fr'\"propertyValueDefinitionName\":\"{attr}\",\"propertyValueIdLong\":[0-9]*,\"skuPropertyImageSummPath\":\"(https:\/\/[0-9a-zA-Z.\/\-\_]*)\"')
image = re.findall(image_regex, target_page_response.text)
if len(image)>0:
image_link = image[0]
image_link = re.sub(r'jpg_[0-9]+x[0-9]+', "jpg_800x800", image_link) # get bigger image instead of preview
break
# skuId, quantity, discount_percentage, price, currency, choice_delivery, image
extract[key] = {"skuid": elem[2], "quantity": elem[3], "discount_percentage": discount, "price": price, "currency": elem[12], "choice_delivery": is_choice, "image": image_link}
else:
print(f'Failed to fetch target page. Status code: {target_page_response.status_code}')
# skuId, quantity, discount_percentage, price, currency, choice_delivery, image
extract[key] = {"skuid": elem[2], "quantity": elem[3], "discount_percentage": discount, "price": price, "currency": elem[12], "choice_delivery": is_choice, "image": image_link}
else:
print(f'Failed to fetch target page. Status code: {target_page_response.status_code}')
return extract return extract
def get_attributes(attributes_raw): def get_attributes(attributes_raw):
'''return a list of attributes from attributes raw string''' '''return a list of attributes from attributes raw string'''
# id_regex = re.compile(r'([0-9]*)=') # id_regex = re.compile(r'([0-9]*)=')
attr_regex = re.compile(r'#([0-9a-zA-Z ]*)') attr_regex = re.compile(r'#([0-9a-zA-Z \.]*)')
# item_id = re.search(id_regex, attributes_raw).group(1) # item_id = re.search(id_regex, attributes_raw).group(1)
attributes = re.findall(attr_regex, attributes_raw) attributes = re.findall(attr_regex, attributes_raw)
@ -86,8 +90,9 @@ def fill_db(items_dict):
def update_items(): def update_items():
'''add new history entries for items in database''' '''add new history entries for items in database'''
in_db = get_item_keys() in_db = get_item_keys()
new_entries = check_items(in_db) for item in in_db:
fill_db(new_entries) new_entry = check_item(item)
fill_db(new_entry)

View File

@ -16,8 +16,8 @@ def init_db():
@app.route('/update') @app.route('/update')
def update_hist(): def update_hist():
print("update") print("update")
fill_db(check_items()) update_items()
return 'Hello, World!' return 'items updated'
@app.route('/add', methods=['POST']) @app.route('/add', methods=['POST'])
def add_item(): def add_item():

View File

@ -1,5 +1,28 @@
#!/usr/bin/python #!/usr/bin/python
from aliexpress import * import requests, re, time, os
def update():
    '''Send a GET request to the local /update endpoint and print the response object.'''
    print(requests.get("http://127.0.0.1:8080/update"))
if __name__ == '__main__': if __name__ == '__main__':
print("ok") regex_time = re.compile(r'([1-9][0-9]*)([smhd])')
formatted_time = os.environ.get('WAIT_TIME')
units = {
's':1,
'm':60,
'h':3600,
'd':86400
}
match = re.search(regex_time, formatted_time)
if bool(match):
raw_time = float(match.group(1))*units[match.group(2)]
while True:
try:
update()
except:
print("update failed")
time.sleep(raw_time)
else:
print("WAIT_TIME incorrect")

View File

@ -45,12 +45,13 @@ def add_history_entry(itemid, skuid, choice, attributes, image, price, currency,
if not check_exist(itemid, skuid): if not check_exist(itemid, skuid):
add_item(itemid, skuid, choice, attributes, image) add_item(itemid, skuid, choice, attributes, image)
cursor.execute("""
SELECT uuid cursor.execute("""
FROM item SELECT uuid
WHERE itemid = %s FROM item
AND skuid = %s WHERE itemid = %s
""", (itemid, skuid)) AND skuid = %s
""", (itemid, skuid))
uuid = cursor.fetchall()[0] uuid = cursor.fetchall()[0]

View File

@ -3,27 +3,24 @@ import requests, re, json, os, yaml
from db import * from db import *
from aliexpress import * from aliexpress import *
def get_conf():
    '''Load the settings.yaml file located next to this script and return its parsed content.'''
    script_dir = os.path.dirname(os.path.realpath(__file__))
    conf_path = script_dir + "/settings.yaml"
    with open(conf_path, 'r') as handle:
        return yaml.safe_load(handle)
if __name__ == '__main__': if __name__ == '__main__':
settings = get_conf() c_l = ["33058732737", ["1.2 M Probe"]]
print(check_item(c_l))
c_l = [("33058732737",["3 M Probe"])] c_l = ["1005005769229528", ["2 E27 Clip EU"]]
print(check_items(c_l)) print(check_item(c_l))
c_l = [("1005005769229528", ["2 E27 Clip EU"])] c_l = ["1005004130931033", []]
print(check_items(c_l)) print(check_item(c_l))
c_l = [("1005004130931033", [])] print("########")
print(check_items(c_l)) c_l = ["1005006030884318", ["Natural White", "7W", "E27"]]
print(check_item(c_l))
# print(get_item_keys())
# initialize(settings["db"]) # initialize(settings["db"])
# fill_db(settings["db"], check_items(settings["item"])) # fill_db(settings["db"], check_item(settings["item"]))
# export_csv(settings["db"]) # export_csv(settings["db"])