wait and retry scraping if punish page encountered

This commit is contained in:
Sam Hadow 2024-03-31 22:37:23 +02:00
parent 79d713e3a2
commit 53599af9bf
4 changed files with 43 additions and 11 deletions

1
.gitignore vendored
View File

@ -4,3 +4,4 @@ settings.yaml
__pycache__/* __pycache__/*
src/__pycache__/* src/__pycache__/*
web/test.html web/test.html
*.csv

View File

@ -31,6 +31,7 @@ def check_item(settings_item):
item_regex = re.compile(r'skuAttr\\\":\\\"(([0-9]*:[0-9;:]*#?[a-zA-Z0-9 \.\-]*;?)*)?\\\",\\\"skuId\\\":([0-9]*),\\\"skuIdStr\\\":\\\"[0-9]*\\\",\\\"skuPropIds\\\":\\\"[0-9,]*\\\",\\\"skuVal\\\":{\\\"availQuantity\\\":([0-9]*),(\\\"bulkOrder\\\":([0-9]*),)?(\\\"discount\\\":\\\"([0-9]*)\\\",\\\"discountTips\\\":\\\"-[0-9]*%\\\",)?\\\"hideOriPrice\\\":(false|true),\\\"inventory\\\":([0-9]*),\\\"isActivity\\\":(true|false),\\\"optionalWarrantyPrice\\\":\[\],(\\\"skuActivityAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)},\\\"skuActivityAmountLocal\\\":\\\"[0-9]*,[0-9]*.\|[0-9]*\|[0-9]*\\\",)?\\\"skuAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)}') item_regex = re.compile(r'skuAttr\\\":\\\"(([0-9]*:[0-9;:]*#?[a-zA-Z0-9 \.\-]*;?)*)?\\\",\\\"skuId\\\":([0-9]*),\\\"skuIdStr\\\":\\\"[0-9]*\\\",\\\"skuPropIds\\\":\\\"[0-9,]*\\\",\\\"skuVal\\\":{\\\"availQuantity\\\":([0-9]*),(\\\"bulkOrder\\\":([0-9]*),)?(\\\"discount\\\":\\\"([0-9]*)\\\",\\\"discountTips\\\":\\\"-[0-9]*%\\\",)?\\\"hideOriPrice\\\":(false|true),\\\"inventory\\\":([0-9]*),\\\"isActivity\\\":(true|false),\\\"optionalWarrantyPrice\\\":\[\],(\\\"skuActivityAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)},\\\"skuActivityAmountLocal\\\":\\\"[0-9]*,[0-9]*.\|[0-9]*\|[0-9]*\\\",)?\\\"skuAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)}')
choice_regex = re.compile(r'businessModel\\\":\\\"CHOICE\\\"') choice_regex = re.compile(r'businessModel\\\":\\\"CHOICE\\\"')
magnifier_image_regex = re.compile(r'<meta property=\"og:image\" content=\"(https:[0-9a-zA-Z\/\-\_.]*)\"\/>') magnifier_image_regex = re.compile(r'<meta property=\"og:image\" content=\"(https:[0-9a-zA-Z\/\-\_.]*)\"\/>')
punish_regex = re.compile(r'(pid: \'punish-page\')|(Deny from x5)')
session = requests.Session() session = requests.Session()
cookies_file_path = os.path.dirname(os.path.realpath(__file__))+'/cookies.json' cookies_file_path = os.path.dirname(os.path.realpath(__file__))+'/cookies.json'
@ -46,6 +47,11 @@ def check_item(settings_item):
url = 'https://aliexpress.com/item/'+str(item)+'.html' url = 'https://aliexpress.com/item/'+str(item)+'.html'
target_page_response = session.get(url) target_page_response = session.get(url)
if target_page_response.status_code == 200: if target_page_response.status_code == 200:
punish = bool(re.search(punish_regex, target_page_response.text))
if punish:
raise ValueError("punish")
content = re.findall(item_regex, target_page_response.text) content = re.findall(item_regex, target_page_response.text)
is_choice = bool(re.search(choice_regex, target_page_response.text)) is_choice = bool(re.search(choice_regex, target_page_response.text))
for elem in content: for elem in content:
@ -96,10 +102,27 @@ def fill_db(items_dict):
def update_items():
    '''Add new history entries for every item already in the database.

    The per-item work (throttling sleep, scrape, store, collect failures)
    is exactly what retry_update does for an arbitrary list of items, so
    this delegates to it instead of carrying a second copy of the loop.

    Returns:
        list of items whose update raised ValueError (check_item raises
        ValueError("punish") when the punish page is served); the caller
        can feed this list back into retry_update later.
    '''
    return retry_update(get_item_keys())
def retry_update(retry_list):
    '''Update only the items in retry_list.

    Each item is scraped with check_item and the result stored with
    fill_db.

    Returns:
        list of the items for which a ValueError was raised (check_item
        raises ValueError("punish") when the punish page is served), so
        the caller can try them again later.
    '''
    still_failing = []
    for item in retry_list:
        # fixed pause before each request -- presumably to throttle scraping
        time.sleep(2)
        try:
            fill_db(check_item(item))
        except ValueError:
            still_failing.append(item)
    return still_failing

View File

@ -1,6 +1,6 @@
from flask import Flask, request, jsonify, render_template from flask import Flask, request, jsonify, render_template
from flask_cors import CORS from flask_cors import CORS
import requests, re, json, os, yaml import requests, re, json, os, yaml, time
from db import * from db import *
from aliexpress import * from aliexpress import *
@ -21,7 +21,12 @@ def init_db():
@app.route('/update', methods = ['GET'])
def update_hist():
    '''Update every tracked item, retrying the ones that failed.

    After a first full pass, the items returned by update_items are
    retried up to 12 times with a 5 minute wait before each attempt.
    The loop stops early once nothing is left to retry.

    NOTE(review): the waits happen inside the request handler, so this
    call can block for about an hour in the worst case (12 x 300 s of
    sleeping, plus scraping time).
    '''
    max_retries = 12
    wait_seconds = 300  # wait 5 minutes between each retry

    print("update")
    pending = update_items()
    for _ in range(max_retries):
        if not pending:
            break
        time.sleep(wait_seconds)
        pending = retry_update(pending)
    return 'items updated'
@app.route('/app/add', methods=['POST']) @app.route('/app/add', methods=['POST'])
@ -34,20 +39,23 @@ def add_item():
attributes = data.get('attributes', '').split(',') if data.get('attributes') else [] attributes = data.get('attributes', '').split(',') if data.get('attributes') else []
new_item = [itemid, attributes] new_item = [itemid, attributes]
try:
extr = check_item(new_item) extr = check_item(new_item)
except ValueError:
return jsonify({'status': 4, "info": "aliexpress punish page"}), 400
if len(extr) > 0: if len(extr) > 0:
skuid = list(extr.values())[0]["skuid"] skuid = list(extr.values())[0]["skuid"]
if check_exist(itemid, skuid): if check_exist(itemid, skuid):
# item already exists # item already exists
return jsonify({'status': 3}), 400 return jsonify({'status': 3, "info": "item already exists"}), 400
else: else:
# item is valid # item is valid
fill_db(extr) fill_db(extr)
return jsonify({'status': 0}), 200 return jsonify({'status': 0, , "info": "item added to database"}), 200
else: else:
# item not valid or can't be parsed # item not valid or can't be parsed
return jsonify({'status': 1}), 400 return jsonify({'status': 1, "info": "item not valid or can't be parsed"}), 400
@app.route('/app/delete', methods=['POST']) @app.route('/app/delete', methods=['POST'])
def del_item(): def del_item():

View File

@ -34,9 +34,9 @@ if __name__ == '__main__':
c_l = ["1005005824413309", ["00350"]] c_l = ["1005005824413309", ["00350"]]
print(check_item(c_l)) print(check_item(c_l))
# TODO : fix regex for this item #
c_l = ["1005005777900699", ["Black"]] # c_l = ["1005005777900699", ["Black"]]
print(check_item(c_l)) # print(check_item(c_l))
# print(get_item_keys()) # print(get_item_keys())
# initialize(settings["db"]) # initialize(settings["db"])