fixed scraper, use selenium instead of requests
This commit is contained in:
parent
41152178cd
commit
59f7b31ea4
@ -20,4 +20,6 @@ EXPOSE 8080
|
|||||||
ENV POSTGRES_HOST=127.0.0.1
|
ENV POSTGRES_HOST=127.0.0.1
|
||||||
ENV POSTGRES_PORT=5432
|
ENV POSTGRES_PORT=5432
|
||||||
|
|
||||||
CMD ["gunicorn","--config", "gunicorn_config.py", "app:app"]
|
ENV PYTHONUNBUFFERED=1
|
||||||
|
|
||||||
|
CMD ["gunicorn","--config", "gunicorn_config.py", "app:app", "--log-level", "debug", "--enable-stdio-inheritance"]
|
||||||
|
@ -3,6 +3,8 @@ podman build --tag alipricetrack_cron:1.0.0 -f ./CronDockerfile
|
|||||||
|
|
||||||
podman pod create --name aliexpress -p 8086:8080
|
podman pod create --name aliexpress -p 8086:8080
|
||||||
|
|
||||||
|
podman run -d --pod=aliexpress --name ali-selenium-firefox docker.io/selenium/standalone-firefox:latest
|
||||||
|
|
||||||
podman run -d --pod=aliexpress -e POSTGRES_DB="aliexpress" -e POSTGRES_USER="postgres" -e POSTGRES_PASSWORD="postgres" -v /home/data/podman/aliexpress/db:/var/lib/postgresql/data:Z --name ali-db docker.io/postgres
|
podman run -d --pod=aliexpress -e POSTGRES_DB="aliexpress" -e POSTGRES_USER="postgres" -e POSTGRES_PASSWORD="postgres" -v /home/data/podman/aliexpress/db:/var/lib/postgresql/data:Z --name ali-db docker.io/postgres
|
||||||
|
|
||||||
podman run -d --pod=aliexpress -e POSTGRES_DB="aliexpress" -e POSTGRES_USER="postgres" -e POSTGRES_PASSWORD="postgres" -v /home/data/podman/aliexpress/cookies.json:/app/cookies.json:Z --name ali-app alipricetrack:1.0.0
|
podman run -d --pod=aliexpress -e POSTGRES_DB="aliexpress" -e POSTGRES_USER="postgres" -e POSTGRES_PASSWORD="postgres" -v /home/data/podman/aliexpress/cookies.json:/app/cookies.json:Z --name ali-app alipricetrack:1.0.0
|
||||||
|
@ -4,3 +4,4 @@ gunicorn
|
|||||||
psycopg2-binary
|
psycopg2-binary
|
||||||
requests
|
requests
|
||||||
pyyaml
|
pyyaml
|
||||||
|
selenium
|
||||||
|
@ -1,83 +1,250 @@
|
|||||||
#!/usr/bin/python
|
#!/usr/bin/python
|
||||||
import requests, re, json, os, yaml, time
|
import requests, re, json, os, yaml, time
|
||||||
from db import *
|
from db import *
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.firefox.service import Service
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.common.action_chains import ActionChains
|
||||||
|
|
||||||
def load_cookies_from_file(file_path):
|
from selenium.webdriver.firefox.options import Options
|
||||||
'''load cookies from a file and return a dict usable in a request session'''
|
from time import sleep
|
||||||
|
import random
|
||||||
|
|
||||||
|
|
||||||
|
def load_cookies_from_file_selenium(file_path):
    '''
    Load cookies from a JSON file and return a list usable in a browser session.

    The file must hold a JSON list of objects. For each object only the keys
    "Name raw", "Content raw" and "Path raw" are read; every other key is ignored.

    Returns a list of dicts, one per usable cookie, with the keys "name" and
    "value" (both strings) plus "path" when the entry provides a non-empty one.
    Entries without a name or with an empty/missing value are skipped.
    '''
    # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        cookies_data = json.load(file)

    cookies_list = []
    for cookie_data in cookies_data:
        name_raw = cookie_data.get("Name raw") or ""
        content_raw = cookie_data.get("Content raw")
        # The cookie value handed to the browser must be a string, so the same
        # stringified value is used for the emptiness test and for the result.
        cookie_value = "" if content_raw is None else f"{content_raw}"

        # A cookie without a name or without a value cannot be set.
        if not name_raw or not cookie_value:
            continue

        cookie_dict = {"name": name_raw, "value": cookie_value}

        # Only forward a path that is really there: when the key is omitted the
        # WebDriver "Add Cookie" command falls back to "/", an empty string does not.
        cookie_path = cookie_data.get("Path raw") or ""
        if cookie_path:
            cookie_dict["path"] = cookie_path

        cookies_list.append(cookie_dict)

    return cookies_list
|
||||||
|
|
||||||
def _xpath_literal(text):
    '''Return text as an XPath 1.0 string literal, even when it contains quote characters.'''
    if "'" not in text:
        return f"'{text}'"
    if '"' not in text:
        return f'"{text}"'
    # XPath 1.0 has no escape character: glue the pieces together with concat().
    return "concat(" + ", \"'\", ".join(f"'{part}'" for part in text.split("'")) + ")"


def _attribute_variants(attribute):
    '''Return the spellings of an attribute label to look for on the page, without duplicates.'''
    # NOTE(review): the previous code built a second variant with re.sub(" ", " ", ...),
    # which is identical to the input as written; presumably one of the two characters
    # was meant to be a non-breaking space (U+00A0) -- confirm against a real item page.
    variants = [attribute, attribute.replace(" ", "\u00a0"), attribute.replace("\u00a0", " ")]
    return list(dict.fromkeys(variants))


def _click_attribute(driver, attr):
    '''Click the option labelled attr (image first, text div as fallback); return True on success.'''
    label = _xpath_literal(attr)
    try:
        # try to find an image with an alt text corresponding to the attribute
        img_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, f"//img[@alt={label}]"))
        )

        sleep(random.uniform(2, 4))
        # move_by_offset() is relative to the current pointer position, so the pointer
        # is first placed on the image and only then jittered by a few pixels.
        ActionChains(driver).move_to_element(img_element).move_by_offset(
            random.randint(-10, 10), random.randint(-10, 10)
        ).click().perform()

        print(f"clicked on {attr}")
        return True
    except Exception:
        try:
            # try to find a div with corresponding text instead
            div_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, f"//div[@title={label}]//span[text()={label}]"))
            )

            sleep(random.uniform(2, 4))
            ActionChains(driver).move_to_element(div_element).click().perform()

            print(f"clicked on {attr}")
            return True
        except Exception as e:
            print(f"Div or image {attr} not found: {e}")
            return False


def check_item(settings_item):
    '''
    return a dict with items data extracted from aliexpress.
    extracted data:
    skuId, quantity, discount_percentage, price, currency, choice_delivery, image
    parameter settings_item is a list (string(itemid), attributes)
    itemid is in aliexpress link to item page. attributes is a list of string. Each string is a choice value (for example which length, or which colour) if multiple items are on the same page, only one by category, order doesn't matter.

    The page is driven through a remote Selenium Firefox session at http://127.0.0.1:4444,
    logged in with the cookies stored in cookies.json next to this file.

    The returned dict has a single entry keyed by (itemid, tuple(attributes)).
    "skuid" is always '0', "quantity" is a string of digits ('0' when not found),
    "discount_percentage" is an int (0 when not found), "choice_delivery" is a bool
    and "image" is None when neither the product image nor a video poster was found.

    Raises ValueError("punish") when the anti-bot page is served and
    ValueError("price not found") when the price cannot be read from the page.
    The browser session is always closed before returning or raising.
    '''
    punish_regex = re.compile(r'(pid: \'punish-page\')|(Deny from x5)|(FAIL_SYS_ILLEGAL_ACCESS)')
    number_regex = re.compile(r'[0-9]+')
    price_regex = re.compile(r'[0-9]*,[0-9]{0,2}')

    ##### to use with selenium firefox container
    driver = webdriver.Remote(
        command_executor='http://127.0.0.1:4444',
        options=webdriver.FirefoxOptions()
    )

    ##### for testing with local geckodriver
    # options=Options()
    # service = webdriver.FirefoxService(executable_path='/bin/geckodriver')
    # profile = webdriver.FirefoxProfile()
    # options.profile = profile
    # driver = webdriver.Firefox(service=service, options=options)

    # Everything below runs under try/finally so the remote session is released
    # even when a step raises (missing cookies file, punish page, network error...).
    try:
        # load login cookies
        cookies_file_path = os.path.dirname(os.path.realpath(__file__))+'/cookies.json'
        cookies = load_cookies_from_file_selenium(cookies_file_path)
        # cookies can only be added once the browser is on the matching domain
        driver.get("https://aliexpress.com")
        sleep(random.uniform(3, 6))
        driver.delete_all_cookies()
        for cookie in cookies:
            driver.add_cookie(cookie)

        sleep(random.uniform(3, 6))

        # accept cookies
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "global-gdpr-btn-wrap"))
            )
            accept_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "btn-accept"))
            )
            sleep(random.uniform(2, 4))
            accept_button.click()

            print("Cookies accepted")
        except Exception as e:
            # best effort: the banner is not always shown
            print(f"An error occurred: {e}")

        print(settings_item)
        item = settings_item[0]
        filter_attributes = settings_item[1]

        url = 'https://aliexpress.com/item/'+str(item)+'.html'
        driver.get(url)

        # check if punish page hit
        if punish_regex.search(driver.page_source):
            print("punish page")
            raise ValueError("punish")

        # refresh page to have the price in a single span
        driver.refresh()

        # click on each attribute
        for attribute in filter_attributes:
            for attr in _attribute_variants(attribute):
                if _click_attribute(driver, attr):
                    break

        ### scraping data
        # price and currency
        price = None
        currency = None
        try:
            div_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'product-price-current')]"))
            )
            span_element = div_element.find_element(By.XPATH, ".//span[contains(@class, 'product-price-value')]")

            price_text = span_element.text
            price = float(price_regex.findall(price_text)[0].replace(",", "."))
            # what is left once the amount is removed is the currency; split()/join
            # drops every kind of whitespace, non-breaking spaces included
            currency = "".join(price_regex.sub("", price_text).split())

            print(f"The extracted price is: {price}, the extracted currency is: {currency}")
        except Exception as e:
            print(f"An error occurred: {e}")

        if price is None:
            # Fail with an explicit error instead of an UnboundLocalError after
            # all the remaining waits have timed out.
            raise ValueError("price not found")

        # discount percentage
        discount = 0
        try:
            div_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'price--original--wEueRiZ')]"))
            )
            span_element = div_element.find_element(By.XPATH, ".//span[contains(@class, 'price--discount--Y9uG2LK')]")

            discount_text = span_element.text
            # int, like the default above
            discount = int(number_regex.findall(discount_text)[0])

            print(f"The extracted discount is: {discount}")
        except Exception as e:
            print(f"An error occurred: {e}")

        # quantity
        quantity = '0'
        try:
            div_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'quantity--info--jnoo_pD')]"))
            )
            span_element = div_element.find_element(By.XPATH, ".//span")

            quantity_text = span_element.text
            quantity = number_regex.findall(quantity_text)[0]

            print(f"The extracted quantity is: {quantity}")
        except Exception as e:
            print(f"An error occurred: {e}")

        # image link
        image_link = None
        try:
            div_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'magnifier--wrap--cF4cafd')]"))
            )

            img_element = div_element.find_element(By.XPATH, ".//img[contains(@class, 'magnifier--image--EYYoSlr')]")

            img_src = img_element.get_attribute("src")
            image_link = re.sub(r'(jpg|png)_[0-9]+x[0-9]+', r'\1_800x800', img_src) # get bigger image instead of preview

            print(f"The extracted image URL is: {image_link}")
        except Exception:
            try:
                # if the image in the magnifier wrap doesn't exist it might be a video instead (image is video preview)
                div_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'video--wrap--EhkqzuR')]"))
                )

                img_element = div_element.find_element(By.XPATH, ".//video[contains(@class, 'video--video--lsI7y97')]")

                img_src = img_element.get_attribute("poster")
                image_link = re.sub(r'(jpg|png)_[0-9]+x[0-9]+', r'\1_800x800', img_src) # get bigger image instead of preview

                print(f"The extracted image URL is: {image_link}")
            except Exception as e:
                print(f"An error occurred: {e}")

        # choice
        choice = False
        try:
            div_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'choice-mind--box--fJKH05M')]"))
            )
            img_element = div_element.find_element(By.XPATH, ".//img")
            # the box counts as "choice" only when it shows this exact badge image
            if img_element.get_attribute("src") == "https://ae01.alicdn.com/kf/S2eef439ead604da69a385490d86118c97/98x42.png":
                choice = True
        except Exception as e:
            print(f"An error occurred: {e}")
        print(f"Choice delivery: {choice}")

        # return the data scraped
        extract = dict()
        key = (item,tuple(filter_attributes))
        # skuId, quantity, discount_percentage, price, currency, choice_delivery, image
        extract[key] = {"skuid": '0', "quantity": quantity, "discount_percentage": discount, "price": price, "currency": currency, "choice_delivery": choice, "image": image_link}
        print(extract)
        return extract
    finally:
        driver.quit()
|
||||||
|
|
||||||
def get_attributes(attributes_raw):
|
def get_attributes(attributes_raw):
|
||||||
@ -124,6 +291,3 @@ def retry_update(retry_list):
|
|||||||
retry.append(item)
|
retry.append(item)
|
||||||
return retry
|
return retry
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
14
src/main.py
14
src/main.py
@ -7,7 +7,7 @@ from aliexpress import *
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
# c_l = ["33058732737", ["1.2 M Probe"]]
|
# c_l = ["33058732737", ["3 M Probe"]]
|
||||||
# print(check_item(c_l))
|
# print(check_item(c_l))
|
||||||
# c_l = ["1005005769229528", ["2 E27 Clip EU"]]
|
# c_l = ["1005005769229528", ["2 E27 Clip EU"]]
|
||||||
# print(check_item(c_l))
|
# print(check_item(c_l))
|
||||||
@ -23,17 +23,17 @@ if __name__ == '__main__':
|
|||||||
#
|
#
|
||||||
# c_l = ["1005005967514183", ["1KG-White"]]
|
# c_l = ["1005005967514183", ["1KG-White"]]
|
||||||
# print(check_item(c_l))
|
# print(check_item(c_l))
|
||||||
#
|
# #
|
||||||
#
|
# #
|
||||||
# c_l = ["1005006062371246", ["Pr Tactile 62g x35"]]
|
# c_l = ["1005006062371246", ["Pr Tactile 62g x35"]]
|
||||||
# print(check_item(c_l))
|
# print(check_item(c_l))
|
||||||
|
|
||||||
# c_l = ["1005005676358693", ["5x250mm 5pcs"]]
|
c_l = ["1005005676358693", ["5x250mm 5pcs"]]
|
||||||
# print(check_item(c_l))
|
|
||||||
|
|
||||||
c_l = ["1005005824413309", ["00350"]]
|
|
||||||
print(check_item(c_l))
|
print(check_item(c_l))
|
||||||
|
|
||||||
|
# c_l = ["1005005824413309", ["00350"]]
|
||||||
|
# print(check_item(c_l))
|
||||||
|
|
||||||
#
|
#
|
||||||
# c_l = ["1005005777900699", ["Black"]]
|
# c_l = ["1005005777900699", ["Black"]]
|
||||||
# print(check_item(c_l))
|
# print(check_item(c_l))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user