Fix scraper: use Selenium instead of requests

This commit is contained in:
Sam Hadow 2024-07-20 22:25:15 +02:00
parent 41152178cd
commit 59f7b31ea4
5 changed files with 224 additions and 55 deletions

View File

@ -20,4 +20,6 @@ EXPOSE 8080
ENV POSTGRES_HOST=127.0.0.1 ENV POSTGRES_HOST=127.0.0.1
ENV POSTGRES_PORT=5432 ENV POSTGRES_PORT=5432
CMD ["gunicorn","--config", "gunicorn_config.py", "app:app"] ENV PYTHONUNBUFFERED=1
CMD ["gunicorn","--config", "gunicorn_config.py", "app:app", "--log-level", "debug", "--enable-stdio-inheritance"]

View File

@ -3,6 +3,8 @@ podman build --tag alipricetrack_cron:1.0.0 -f ./CronDockerfile
podman pod create --name aliexpress -p 8086:8080 podman pod create --name aliexpress -p 8086:8080
podman run -d --pod=aliexpress --name ali-selenium-firefox docker.io/selenium/standalone-firefox:latest
podman run -d --pod=aliexpress -e POSTGRES_DB="aliexpress" -e POSTGRES_USER="postgres" -e POSTGRES_PASSWORD="postgres" -v /home/data/podman/aliexpress/db:/var/lib/postgresql/data:Z --name ali-db docker.io/postgres podman run -d --pod=aliexpress -e POSTGRES_DB="aliexpress" -e POSTGRES_USER="postgres" -e POSTGRES_PASSWORD="postgres" -v /home/data/podman/aliexpress/db:/var/lib/postgresql/data:Z --name ali-db docker.io/postgres
podman run -d --pod=aliexpress -e POSTGRES_DB="aliexpress" -e POSTGRES_USER="postgres" -e POSTGRES_PASSWORD="postgres" -v /home/data/podman/aliexpress/cookies.json:/app/cookies.json:Z --name ali-app alipricetrack:1.0.0 podman run -d --pod=aliexpress -e POSTGRES_DB="aliexpress" -e POSTGRES_USER="postgres" -e POSTGRES_PASSWORD="postgres" -v /home/data/podman/aliexpress/cookies.json:/app/cookies.json:Z --name ali-app alipricetrack:1.0.0

View File

@ -4,3 +4,4 @@ gunicorn
psycopg2-binary psycopg2-binary
requests requests
pyyaml pyyaml
selenium

View File

@ -1,83 +1,250 @@
#!/usr/bin/python #!/usr/bin/python
import requests, re, json, os, yaml, time import requests, re, json, os, yaml, time
from db import * from db import *
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
def load_cookies_from_file(file_path): from selenium.webdriver.firefox.options import Options
'''load cookies from a file and return a dict usable in a request session''' from time import sleep
import random
def load_cookies_from_file_selenium(file_path):
'''Load cookies from a file and return a list usable in a browser session'''
with open(file_path, 'r') as file: with open(file_path, 'r') as file:
cookies_data = json.load(file) cookies_data = json.load(file)
cookies_dict = {} cookies_list = []
for cookie_data in cookies_data: for cookie_data in cookies_data:
name_raw = cookie_data.get("Name raw", "") name_raw = cookie_data.get("Name raw", "")
content_raw = cookie_data.get("Content raw", "") content_raw = cookie_data.get("Content raw", "")
cookie_value = f"{content_raw}" cookie_value = f"{content_raw}"
cookie_path = cookie_data.get("Path raw", "")
cookie_dict = {"name": name_raw, "value": content_raw, "path": cookie_path}
if len(cookie_value) > 0: if len(cookie_value) > 0:
cookies_dict[name_raw] = cookie_value cookies_list.append(cookie_dict)
return cookies_dict return cookies_list
def check_item(settings_item): def check_item(settings_item):
''' '''
return a dict with items data extracted from aliexpress. return a dict with items data extracted from aliexpress.
extracted data: extracted data:
skuId, quantity, discount_percentage, price, currency, choice_delivery, image skuId, quantity, discount_percentage, price, currency, choice_delivery, image
parameter settings_item is a list of tables (string(itemid), attributes) parameter settings_item is a list (string(itemid), attributes)
itemid is in aliexpress link to item page. attributes is a list of string. Each string is a choice value (for example which length, or which colour) if multiple items are on the same page, only one by category, order doesn't matter. itemid is in aliexpress link to item page. attributes is a list of string. Each string is a choice value (for example which length, or which colour) if multiple items are on the same page, only one by category, order doesn't matter.
''' '''
punish_regex = re.compile(r'(pid: \'punish-page\')|(Deny from x5)|(FAIL_SYS_ILLEGAL_ACCESS)')
number_regex = re.compile(r'[0-9]+')
price_regex = re.compile(r'[0-9]*,[0-9]{0,2}')
item_regex = re.compile(r'skuAttr\\\":\\\"(([0-9]*:[0-9;:]*#?[a-zA-Z0-9 \.\-]*;?)*)?\\\",\\\"skuId\\\":([0-9]*),\\\"skuIdStr\\\":\\\"[0-9]*\\\",\\\"skuPropIds\\\":\\\"[0-9,]*\\\",\\\"skuVal\\\":{\\\"availQuantity\\\":([0-9]*),(\\\"bulkOrder\\\":([0-9]*),)?(\\\"discount\\\":\\\"([0-9]*)\\\",\\\"discountTips\\\":\\\"-[0-9]*%\\\",)?\\\"hideOriPrice\\\":(false|true),\\\"inventory\\\":([0-9]*),\\\"isActivity\\\":(true|false),\\\"optionalWarrantyPrice\\\":\[\],(\\\"skuActivityAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)},\\\"skuActivityAmountLocal\\\":\\\"[0-9]*,[0-9]*.\|[0-9]*\|[0-9]*\\\",)?\\\"skuAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)}')
choice_regex = re.compile(r'businessModel\\\":\\\"CHOICE\\\"')
magnifier_image_regex = re.compile(r'<meta property=\"og:image\" content=\"(https:[0-9a-zA-Z\/\-\_.]*)\"\/>')
punish_regex = re.compile(r'(pid: \'punish-page\')|(Deny from x5)')
session = requests.Session() ##### to use with selenium firefox container
driver = webdriver.Remote(
command_executor='http://127.0.0.1:4444',
options=webdriver.FirefoxOptions()
)
##### for testing with local geckodriver
# options=Options()
# service = webdriver.FirefoxService(executable_path='/bin/geckodriver')
# profile = webdriver.FirefoxProfile()
# options.profile = profile
# driver = webdriver.Firefox(service=service, options=options)
# load login cookies
cookies_file_path = os.path.dirname(os.path.realpath(__file__))+'/cookies.json' cookies_file_path = os.path.dirname(os.path.realpath(__file__))+'/cookies.json'
cookies = load_cookies_from_file(cookies_file_path) cookies = load_cookies_from_file_selenium(cookies_file_path)
session.cookies.update(cookies) driver.get("https://aliexpress.com")
sleep(random.uniform(3, 6))
driver.delete_all_cookies()
for cookie in cookies:
driver.add_cookie(cookie)
sleep(random.uniform(3, 6))
# accept cookies
try:
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, "global-gdpr-btn-wrap"))
)
accept_button = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.CLASS_NAME, "btn-accept"))
)
sleep(random.uniform(2, 4))
accept_button.click()
print("Cookies accepted")
except Exception as e:
print(f"An error occurred: {e}")
extract = dict()
print(settings_item) print(settings_item)
item = settings_item[0] item = settings_item[0]
filter_attributes = settings_item[1] filter_attributes = settings_item[1]
url = 'https://aliexpress.com/item/'+str(item)+'.html' url = 'https://aliexpress.com/item/'+str(item)+'.html'
target_page_response = session.get(url)
if target_page_response.status_code == 200:
punish = bool(re.search(punish_regex, target_page_response.text)) driver.get(url)
# check if punish page hit
punish = bool(re.search(punish_regex, driver.page_source))
if punish: if punish:
print("punish page")
driver.quit()
raise ValueError("punish") raise ValueError("punish")
content = re.findall(item_regex, target_page_response.text)
is_choice = bool(re.search(choice_regex, target_page_response.text))
for elem in content:
if set(get_attributes(elem[0])) == set(filter_attributes):
key = (item,tuple(filter_attributes))
discount = 0 if len(elem[7]) == 0 else int(elem[7])
price = float(elem[15]) if len(elem[13]) == 0 else float(elem[13])
# get item image
image_link = re.findall(magnifier_image_regex, target_page_response.text)[0]
for attr in filter_attributes:
image_regex = re.compile(fr'\"propertyValueDefinitionName\":\"{attr}\",\"propertyValueIdLong\":[0-9]*,\"skuPropertyImageSummPath\":\"(https:\/\/[0-9a-zA-Z.\/\-\_]*)\"')
image = re.findall(image_regex, target_page_response.text)
if len(image)>0:
image_link = image[0]
image_link = re.sub(r'(jpg|png)_[0-9]+x[0-9]+', r'\1_800x800', image_link) # get bigger image instead of preview
break
# get currency
currency = elem[12] if (len(elem[12])>0) else elem[14]
# skuId, quantity, discount_percentage, price, currency, choice_delivery, image
extract[key] = {"skuid": elem[2], "quantity": elem[3], "discount_percentage": discount, "price": price, "currency": currency, "choice_delivery": is_choice, "image": image_link}
else: else:
print(f'Failed to fetch target page. Status code: {target_page_response.status_code}') # refresh page to have the price in a single span
driver.refresh()
# click on each attribute
for attribute in filter_attributes:
if bool(re.search(" ", attribute)):
possible_attribute = [attribute, re.sub(" ", " ", attribute)]
else:
possible_attribute = [attribute]
for attr in possible_attribute:
try:
# try to find an image with an alt text corresponding to the attribute
img_element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH, f"//img[@alt='{attr}']"))
)
location = img_element.location
size = img_element.size
# click on where the image is (images appear to always be 60x60px)
center_x = (location['x'] + size['width'] / 2) + random.uniform(-10,10)
center_y = (location['y'] + size['height'] / 2) + random.uniform(-10,10)
sleep(random.uniform(2, 4))
actions = ActionChains(driver)
actions.move_by_offset(center_x, center_y).click().perform()
print(f"clicked on {attr}")
break
except Exception as e:
try:
# try to find a div with corresponding text instead
div_element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH, f"//div[@title='{attr}']//span[text()='{attr}']"))
)
sleep(random.uniform(2, 4))
actions = ActionChains(driver)
actions.move_to_element(div_element).click().perform()
print(f"clicked on {attr}")
break
except Exception as e:
print(f"Div or image {attr} not found: {e}")
### scraping data
# price and currency
try:
div_element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'product-price-current')]"))
)
span_element = div_element.find_element(By.XPATH, ".//span[contains(@class, 'product-price-value')]")
price_text = span_element.text
price = float(re.sub(",", ".", re.findall(price_regex, price_text)[0]))
currency = re.sub(" ", "", re.sub(price_regex, "", price_text))
print(f"The extracted price is: {price}, the extracted currency is: {currency}")
except Exception as e:
print(f"An error occurred: {e}")
# discount percentage
discount = 0
try:
div_element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'price--original--wEueRiZ')]"))
)
span_element = div_element.find_element(By.XPATH, ".//span[contains(@class, 'price--discount--Y9uG2LK')]")
discount_text = span_element.text
discount = re.findall(number_regex, discount_text)[0]
print(f"The extracted discount is: {discount}")
except Exception as e:
print(f"An error occurred: {e}")
# quantity
quantity = '0'
try:
div_element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'quantity--info--jnoo_pD')]"))
)
span_element = div_element.find_element(By.XPATH, ".//span")
quantity_text = span_element.text
quantity = re.findall(number_regex, quantity_text)[0]
print(f"The extracted quantity is: {quantity}")
except Exception as e:
print(f"An error occurred: {e}")
# image link
try:
div_element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'magnifier--wrap--cF4cafd')]"))
)
img_element = div_element.find_element(By.XPATH, ".//img[contains(@class, 'magnifier--image--EYYoSlr')]")
img_src = img_element.get_attribute("src")
image_link = re.sub(r'(jpg|png)_[0-9]+x[0-9]+', r'\1_800x800', img_src) # get bigger image instead of preview
print(f"The extracted image URL is: {image_link}")
except Exception as e:
try:
# if the image in the magnifier wrap doesn't exist it might be a video instead (image is video preview)
div_element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'video--wrap--EhkqzuR')]"))
)
img_element = div_element.find_element(By.XPATH, ".//video[contains(@class, 'video--video--lsI7y97')]")
img_src = img_element.get_attribute("poster")
image_link = re.sub(r'(jpg|png)_[0-9]+x[0-9]+', r'\1_800x800', img_src) # get bigger image instead of preview
print(f"The extracted image URL is: {image_link}")
except Exception as e:
print(f"An error occurred: {e}")
# choice
choice = False
try:
div_element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'choice-mind--box--fJKH05M')]"))
)
img_element = div_element.find_element(By.XPATH, ".//img")
if img_element.get_attribute("src") == "https://ae01.alicdn.com/kf/S2eef439ead604da69a385490d86118c97/98x42.png":
choice = True
except Exception as e:
print(f"An error occurred: {e}")
print(f"Choice delivery: {choice}")
# return the data scraped
extract = dict()
key = (item,tuple(filter_attributes))
# skuId, quantity, discount_percentage, price, currency, choice_delivery, image
extract[key] = {"skuid": '0', "quantity": quantity, "discount_percentage": discount, "price": price, "currency": currency, "choice_delivery": choice, "image": image_link}
print(extract)
driver.quit()
return extract return extract
def get_attributes(attributes_raw): def get_attributes(attributes_raw):
@ -124,6 +291,3 @@ def retry_update(retry_list):
retry.append(item) retry.append(item)
return retry return retry

View File

@ -7,7 +7,7 @@ from aliexpress import *
if __name__ == '__main__': if __name__ == '__main__':
# c_l = ["33058732737", ["1.2 M Probe"]] # c_l = ["33058732737", ["3 M Probe"]]
# print(check_item(c_l)) # print(check_item(c_l))
# c_l = ["1005005769229528", ["2 E27 Clip EU"]] # c_l = ["1005005769229528", ["2 E27 Clip EU"]]
# print(check_item(c_l)) # print(check_item(c_l))
@ -23,17 +23,17 @@ if __name__ == '__main__':
# #
# c_l = ["1005005967514183", ["1KG-White"]] # c_l = ["1005005967514183", ["1KG-White"]]
# print(check_item(c_l)) # print(check_item(c_l))
# # #
# # #
# c_l = ["1005006062371246", ["Pr Tactile 62g x35"]] # c_l = ["1005006062371246", ["Pr Tactile 62g x35"]]
# print(check_item(c_l)) # print(check_item(c_l))
# c_l = ["1005005676358693", ["5x250mm 5pcs"]] c_l = ["1005005676358693", ["5x250mm 5pcs"]]
# print(check_item(c_l))
c_l = ["1005005824413309", ["00350"]]
print(check_item(c_l)) print(check_item(c_l))
# c_l = ["1005005824413309", ["00350"]]
# print(check_item(c_l))
# #
# c_l = ["1005005777900699", ["Black"]] # c_l = ["1005005777900699", ["Black"]]
# print(check_item(c_l)) # print(check_item(c_l))