wait and retry scraping if punish page encountered
This commit is contained in:
parent
79d713e3a2
commit
53599af9bf
1
.gitignore
vendored
1
.gitignore
vendored
@ -4,3 +4,4 @@ settings.yaml
|
||||
__pycache__/*
|
||||
src/__pycache__/*
|
||||
web/test.html
|
||||
*.csv
|
||||
|
@ -31,6 +31,7 @@ def check_item(settings_item):
|
||||
item_regex = re.compile(r'skuAttr\\\":\\\"(([0-9]*:[0-9;:]*#?[a-zA-Z0-9 \.\-]*;?)*)?\\\",\\\"skuId\\\":([0-9]*),\\\"skuIdStr\\\":\\\"[0-9]*\\\",\\\"skuPropIds\\\":\\\"[0-9,]*\\\",\\\"skuVal\\\":{\\\"availQuantity\\\":([0-9]*),(\\\"bulkOrder\\\":([0-9]*),)?(\\\"discount\\\":\\\"([0-9]*)\\\",\\\"discountTips\\\":\\\"-[0-9]*%\\\",)?\\\"hideOriPrice\\\":(false|true),\\\"inventory\\\":([0-9]*),\\\"isActivity\\\":(true|false),\\\"optionalWarrantyPrice\\\":\[\],(\\\"skuActivityAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)},\\\"skuActivityAmountLocal\\\":\\\"[0-9]*,[0-9]*.\|[0-9]*\|[0-9]*\\\",)?\\\"skuAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)}')
|
||||
choice_regex = re.compile(r'businessModel\\\":\\\"CHOICE\\\"')
|
||||
magnifier_image_regex = re.compile(r'<meta property=\"og:image\" content=\"(https:[0-9a-zA-Z\/\-\_.]*)\"\/>')
|
||||
punish_regex = re.compile(r'(pid: \'punish-page\')|(Deny from x5)')
|
||||
|
||||
session = requests.Session()
|
||||
cookies_file_path = os.path.dirname(os.path.realpath(__file__))+'/cookies.json'
|
||||
@ -46,6 +47,11 @@ def check_item(settings_item):
|
||||
url = 'https://aliexpress.com/item/'+str(item)+'.html'
|
||||
target_page_response = session.get(url)
|
||||
if target_page_response.status_code == 200:
|
||||
|
||||
punish = bool(re.search(punish_regex, target_page_response.text))
|
||||
if punish:
|
||||
raise ValueError("punish")
|
||||
|
||||
content = re.findall(item_regex, target_page_response.text)
|
||||
is_choice = bool(re.search(choice_regex, target_page_response.text))
|
||||
for elem in content:
|
||||
@ -96,10 +102,27 @@ def fill_db(items_dict):
|
||||
def update_items():
    '''Add a new history entry for every item already in the database.

    Each item key returned by get_item_keys() is scraped once with
    check_item() and the result is stored with fill_db().

    Returns:
        list: the items whose update raised ValueError (check_item raises
        ValueError("punish") when the punish page is served), so that the
        caller can try them again later.
    '''
    in_db = get_item_keys()
    retry = []
    for item in in_db:
        # pause between requests -- presumably to avoid triggering the
        # punish page; confirm the required delay
        time.sleep(2)
        # scrape exactly once per item, inside the guard: an unguarded call
        # would let the ValueError escape before the item is queued for retry
        try:
            new_entry = check_item(item)
            fill_db(new_entry)
        except ValueError:
            retry.append(item)
    return retry
|
||||
|
||||
def retry_update(retry_list):
    '''Update only the items listed in retry_list.

    Args:
        retry_list: items whose previous update attempt failed.

    Returns:
        list: the items that raised ValueError again on this pass.
    '''
    still_failing = []
    for pending in retry_list:
        # same 2-second pause between requests as the full update pass
        time.sleep(2)
        try:
            fill_db(check_item(pending))
        except ValueError:
            still_failing.append(pending)
    return still_failing
|
||||
|
||||
|
||||
|
||||
|
20
src/app.py
20
src/app.py
@ -1,6 +1,6 @@
|
||||
from flask import Flask, request, jsonify, render_template
|
||||
from flask_cors import CORS
|
||||
import requests, re, json, os, yaml
|
||||
import requests, re, json, os, yaml, time
|
||||
from db import *
|
||||
from aliexpress import *
|
||||
|
||||
@ -21,7 +21,12 @@ def init_db():
|
||||
@app.route('/update', methods = ['GET'])
def update_hist():
    '''Update the history of every tracked item, retrying failed ones.

    Runs one full update pass, then re-tries the items that failed (e.g.
    because the punish page was served) every 5 minutes, for at most 12
    extra passes. The request blocks until all passes are done.

    Returns:
        str: 'items updated', whether or not some items are still failing
        after the last retry.
    '''
    max_retries = 12
    retry_delay_seconds = 300  # wait 5 minutes between each retry

    print("update")
    # a single full pass; its return value is the list of items to retry
    retry = update_items()
    retry_count = 0
    while retry and retry_count < max_retries:
        time.sleep(retry_delay_seconds)
        retry = retry_update(retry)
        retry_count += 1
    return 'items updated'
|
||||
|
||||
@app.route('/app/add', methods=['POST'])
|
||||
@ -34,20 +39,23 @@ def add_item():
|
||||
attributes = data.get('attributes', '').split(',') if data.get('attributes') else []
|
||||
|
||||
new_item = [itemid, attributes]
|
||||
extr = check_item(new_item)
|
||||
try:
|
||||
extr = check_item(new_item)
|
||||
except ValueError:
|
||||
return jsonify({'status': 4, "info": "aliexpress punish page"}), 400
|
||||
|
||||
if len(extr) > 0:
|
||||
skuid = list(extr.values())[0]["skuid"]
|
||||
if check_exist(itemid, skuid):
|
||||
# item already exists
|
||||
return jsonify({'status': 3}), 400
|
||||
return jsonify({'status': 3, "info": "item already exists"}), 400
|
||||
else:
|
||||
# item is valid
|
||||
fill_db(extr)
|
||||
return jsonify({'status': 0}), 200
|
||||
return jsonify({'status': 0, , "info": "item added to database"}), 200
|
||||
else:
|
||||
# item not valid or can't be parsed
|
||||
return jsonify({'status': 1}), 400
|
||||
return jsonify({'status': 1, "info": "item not valid or can't be parsed"}), 400
|
||||
|
||||
@app.route('/app/delete', methods=['POST'])
|
||||
def del_item():
|
||||
|
@ -34,9 +34,9 @@ if __name__ == '__main__':
|
||||
c_l = ["1005005824413309", ["00350"]]
|
||||
print(check_item(c_l))
|
||||
|
||||
# TODO : fix regex for this item
|
||||
c_l = ["1005005777900699", ["Black"]]
|
||||
print(check_item(c_l))
|
||||
#
|
||||
# c_l = ["1005005777900699", ["Black"]]
|
||||
# print(check_item(c_l))
|
||||
# print(get_item_keys())
|
||||
|
||||
# initialize(settings["db"])
|
||||
|
Loading…
x
Reference in New Issue
Block a user