Wait and retry scraping if a punish page is encountered
This commit is contained in:
parent
79d713e3a2
commit
53599af9bf
1
.gitignore
vendored
1
.gitignore
vendored
@ -4,3 +4,4 @@ settings.yaml
|
|||||||
__pycache__/*
|
__pycache__/*
|
||||||
src/__pycache__/*
|
src/__pycache__/*
|
||||||
web/test.html
|
web/test.html
|
||||||
|
*.csv
|
||||||
|
@ -31,6 +31,7 @@ def check_item(settings_item):
|
|||||||
item_regex = re.compile(r'skuAttr\\\":\\\"(([0-9]*:[0-9;:]*#?[a-zA-Z0-9 \.\-]*;?)*)?\\\",\\\"skuId\\\":([0-9]*),\\\"skuIdStr\\\":\\\"[0-9]*\\\",\\\"skuPropIds\\\":\\\"[0-9,]*\\\",\\\"skuVal\\\":{\\\"availQuantity\\\":([0-9]*),(\\\"bulkOrder\\\":([0-9]*),)?(\\\"discount\\\":\\\"([0-9]*)\\\",\\\"discountTips\\\":\\\"-[0-9]*%\\\",)?\\\"hideOriPrice\\\":(false|true),\\\"inventory\\\":([0-9]*),\\\"isActivity\\\":(true|false),\\\"optionalWarrantyPrice\\\":\[\],(\\\"skuActivityAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)},\\\"skuActivityAmountLocal\\\":\\\"[0-9]*,[0-9]*.\|[0-9]*\|[0-9]*\\\",)?\\\"skuAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)}')
|
item_regex = re.compile(r'skuAttr\\\":\\\"(([0-9]*:[0-9;:]*#?[a-zA-Z0-9 \.\-]*;?)*)?\\\",\\\"skuId\\\":([0-9]*),\\\"skuIdStr\\\":\\\"[0-9]*\\\",\\\"skuPropIds\\\":\\\"[0-9,]*\\\",\\\"skuVal\\\":{\\\"availQuantity\\\":([0-9]*),(\\\"bulkOrder\\\":([0-9]*),)?(\\\"discount\\\":\\\"([0-9]*)\\\",\\\"discountTips\\\":\\\"-[0-9]*%\\\",)?\\\"hideOriPrice\\\":(false|true),\\\"inventory\\\":([0-9]*),\\\"isActivity\\\":(true|false),\\\"optionalWarrantyPrice\\\":\[\],(\\\"skuActivityAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)},\\\"skuActivityAmountLocal\\\":\\\"[0-9]*,[0-9]*.\|[0-9]*\|[0-9]*\\\",)?\\\"skuAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)}')
|
||||||
choice_regex = re.compile(r'businessModel\\\":\\\"CHOICE\\\"')
|
choice_regex = re.compile(r'businessModel\\\":\\\"CHOICE\\\"')
|
||||||
magnifier_image_regex = re.compile(r'<meta property=\"og:image\" content=\"(https:[0-9a-zA-Z\/\-\_.]*)\"\/>')
|
magnifier_image_regex = re.compile(r'<meta property=\"og:image\" content=\"(https:[0-9a-zA-Z\/\-\_.]*)\"\/>')
|
||||||
|
punish_regex = re.compile(r'(pid: \'punish-page\')|(Deny from x5)')
|
||||||
|
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
cookies_file_path = os.path.dirname(os.path.realpath(__file__))+'/cookies.json'
|
cookies_file_path = os.path.dirname(os.path.realpath(__file__))+'/cookies.json'
|
||||||
@ -46,6 +47,11 @@ def check_item(settings_item):
|
|||||||
url = 'https://aliexpress.com/item/'+str(item)+'.html'
|
url = 'https://aliexpress.com/item/'+str(item)+'.html'
|
||||||
target_page_response = session.get(url)
|
target_page_response = session.get(url)
|
||||||
if target_page_response.status_code == 200:
|
if target_page_response.status_code == 200:
|
||||||
|
|
||||||
|
punish = bool(re.search(punish_regex, target_page_response.text))
|
||||||
|
if punish:
|
||||||
|
raise ValueError("punish")
|
||||||
|
|
||||||
content = re.findall(item_regex, target_page_response.text)
|
content = re.findall(item_regex, target_page_response.text)
|
||||||
is_choice = bool(re.search(choice_regex, target_page_response.text))
|
is_choice = bool(re.search(choice_regex, target_page_response.text))
|
||||||
for elem in content:
|
for elem in content:
|
||||||
@ -96,10 +102,27 @@ def fill_db(items_dict):
|
|||||||
def update_items():
|
def update_items():
|
||||||
'''add new history entries for items in database'''
|
'''add new history entries for items in database'''
|
||||||
in_db = get_item_keys()
|
in_db = get_item_keys()
|
||||||
|
retry = []
|
||||||
for item in in_db:
|
for item in in_db:
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
new_entry = check_item(item)
|
try:
|
||||||
fill_db(new_entry)
|
new_entry = check_item(item)
|
||||||
|
fill_db(new_entry)
|
||||||
|
except ValueError:
|
||||||
|
retry.append(item)
|
||||||
|
return retry
|
||||||
|
|
||||||
|
def retry_update(retry_list):
|
||||||
|
'''update entries from the retry list only'''
|
||||||
|
retry = []
|
||||||
|
for item in retry_list:
|
||||||
|
time.sleep(2)
|
||||||
|
try:
|
||||||
|
new_entry = check_item(item)
|
||||||
|
fill_db(new_entry)
|
||||||
|
except ValueError:
|
||||||
|
retry.append(item)
|
||||||
|
return retry
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
20
src/app.py
20
src/app.py
@ -1,6 +1,6 @@
|
|||||||
from flask import Flask, request, jsonify, render_template
|
from flask import Flask, request, jsonify, render_template
|
||||||
from flask_cors import CORS
|
from flask_cors import CORS
|
||||||
import requests, re, json, os, yaml
|
import requests, re, json, os, yaml, time
|
||||||
from db import *
|
from db import *
|
||||||
from aliexpress import *
|
from aliexpress import *
|
||||||
|
|
||||||
@ -21,7 +21,12 @@ def init_db():
|
|||||||
@app.route('/update', methods = ['GET'])
|
@app.route('/update', methods = ['GET'])
|
||||||
def update_hist():
|
def update_hist():
|
||||||
print("update")
|
print("update")
|
||||||
update_items()
|
retry = update_items()
|
||||||
|
retry_count = 0
|
||||||
|
while len(retry)>0 and retry_count < 12:
|
||||||
|
time.sleep(300) # wait 5 minutes between each retry
|
||||||
|
retry = retry_update(retry)
|
||||||
|
retry_count += 1
|
||||||
return 'items updated'
|
return 'items updated'
|
||||||
|
|
||||||
@app.route('/app/add', methods=['POST'])
|
@app.route('/app/add', methods=['POST'])
|
||||||
@ -34,20 +39,23 @@ def add_item():
|
|||||||
attributes = data.get('attributes', '').split(',') if data.get('attributes') else []
|
attributes = data.get('attributes', '').split(',') if data.get('attributes') else []
|
||||||
|
|
||||||
new_item = [itemid, attributes]
|
new_item = [itemid, attributes]
|
||||||
extr = check_item(new_item)
|
try:
|
||||||
|
extr = check_item(new_item)
|
||||||
|
except ValueError:
|
||||||
|
return jsonify({'status': 4, "info": "aliexpress punish page"}), 400
|
||||||
|
|
||||||
if len(extr) > 0:
|
if len(extr) > 0:
|
||||||
skuid = list(extr.values())[0]["skuid"]
|
skuid = list(extr.values())[0]["skuid"]
|
||||||
if check_exist(itemid, skuid):
|
if check_exist(itemid, skuid):
|
||||||
# item already exists
|
# item already exists
|
||||||
return jsonify({'status': 3}), 400
|
return jsonify({'status': 3, "info": "item already exists"}), 400
|
||||||
else:
|
else:
|
||||||
# item is valid
|
# item is valid
|
||||||
fill_db(extr)
|
fill_db(extr)
|
||||||
return jsonify({'status': 0}), 200
|
return jsonify({'status': 0, "info": "item added to database"}), 200
|
||||||
else:
|
else:
|
||||||
# item not valid or can't be parsed
|
# item not valid or can't be parsed
|
||||||
return jsonify({'status': 1}), 400
|
return jsonify({'status': 1, "info": "item not valid or can't be parsed"}), 400
|
||||||
|
|
||||||
@app.route('/app/delete', methods=['POST'])
|
@app.route('/app/delete', methods=['POST'])
|
||||||
def del_item():
|
def del_item():
|
||||||
|
@ -34,9 +34,9 @@ if __name__ == '__main__':
|
|||||||
c_l = ["1005005824413309", ["00350"]]
|
c_l = ["1005005824413309", ["00350"]]
|
||||||
print(check_item(c_l))
|
print(check_item(c_l))
|
||||||
|
|
||||||
# TODO : fix regex for this item
|
#
|
||||||
c_l = ["1005005777900699", ["Black"]]
|
# c_l = ["1005005777900699", ["Black"]]
|
||||||
print(check_item(c_l))
|
# print(check_item(c_l))
|
||||||
# print(get_item_keys())
|
# print(get_item_keys())
|
||||||
|
|
||||||
# initialize(settings["db"])
|
# initialize(settings["db"])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user