From 59f7b31ea47cbcdab3653c8b0b71f651965ff1fb Mon Sep 17 00:00:00 2001 From: Sam Hadow Date: Sat, 20 Jul 2024 22:25:15 +0200 Subject: [PATCH] fixed scraper, use selenium instead of requests --- Dockerfile | 4 +- podman-commands | 2 + requirements.txt | 1 + src/aliexpress.py | 258 +++++++++++++++++++++++++++++++++++++--------- src/main.py | 14 +-- 5 files changed, 224 insertions(+), 55 deletions(-) diff --git a/Dockerfile b/Dockerfile index ff8465e..f9ab7d0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,4 +20,6 @@ EXPOSE 8080 ENV POSTGRES_HOST=127.0.0.1 ENV POSTGRES_PORT=5432 -CMD ["gunicorn","--config", "gunicorn_config.py", "app:app"] +ENV PYTHONUNBUFFERED=1 + +CMD ["gunicorn","--config", "gunicorn_config.py", "app:app", "--log-level", "debug", "--enable-stdio-inheritance"] diff --git a/podman-commands b/podman-commands index a37b341..2a54280 100644 --- a/podman-commands +++ b/podman-commands @@ -3,6 +3,8 @@ podman build --tag alipricetrack_cron:1.0.0 -f ./CronDockerfile podman pod create --name aliexpress -p 8086:8080 +podman run -d --pod=aliexpress --name ali-selenium-firefox docker.io/selenium/standalone-firefox:latest + podman run -d --pod=aliexpress -e POSTGRES_DB="aliexpress" -e POSTGRES_USER="postgres" -e POSTGRES_PASSWORD="postgres" -v /home/data/podman/aliexpress/db:/var/lib/postgresql/data:Z --name ali-db docker.io/postgres podman run -d --pod=aliexpress -e POSTGRES_DB="aliexpress" -e POSTGRES_USER="postgres" -e POSTGRES_PASSWORD="postgres" -v /home/data/podman/aliexpress/cookies.json:/app/cookies.json:Z --name ali-app alipricetrack:1.0.0 diff --git a/requirements.txt b/requirements.txt index 690bd24..e0d739a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ gunicorn psycopg2-binary requests pyyaml +selenium diff --git a/src/aliexpress.py b/src/aliexpress.py index 7274b54..27e5a3f 100644 --- a/src/aliexpress.py +++ b/src/aliexpress.py @@ -1,84 +1,251 @@ #!/usr/bin/python import requests, re, json, os, yaml, time from db import * +from selenium import webdriver +from selenium.webdriver.firefox.service import Service +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.action_chains import ActionChains -def load_cookies_from_file(file_path): - '''load cookies from a file and return a dict usable in a request session''' +from selenium.webdriver.firefox.options import Options +from time import sleep +import random + + +def load_cookies_from_file_selenium(file_path): + '''Load cookies from a file and return a list usable in a browser session''' with open(file_path, 'r') as file: cookies_data = json.load(file) - cookies_dict = {} + cookies_list = [] for cookie_data in cookies_data: name_raw = cookie_data.get("Name raw", "") content_raw = cookie_data.get("Content raw", "") cookie_value = f"{content_raw}" + cookie_path = cookie_data.get("Path raw", "") + + cookie_dict = {"name": name_raw, "value": content_raw, "path": cookie_path} if len(cookie_value) > 0: - cookies_dict[name_raw] = cookie_value + cookies_list.append(cookie_dict) - return cookies_dict + return cookies_list def check_item(settings_item): ''' return a dict with items data extracted from aliexpress. extracted data: skuId, quantity, discount_percentage, price, currency, choice_delivery, image - parameter settings_item is a list of tables (string(itemid), attributes) + parameter settings_item is a list (string(itemid), attributes) itemid is in aliexpress link to item page. attributes is a list of string. Each string is a choice value (for example which length, or which colour) if multiple items are on the same page, only one by category, order doesn't matter. ''' + punish_regex = re.compile(r'(pid: \'punish-page\')|(Deny from x5)|(FAIL_SYS_ILLEGAL_ACCESS)') + number_regex = re.compile(r'[0-9]+') + price_regex = re.compile(r'[0-9]*,[0-9]{0,2}') - item_regex = re.compile(r'skuAttr\\\":\\\"(([0-9]*:[0-9;:]*#?[a-zA-Z0-9 \.\-]*;?)*)?\\\",\\\"skuId\\\":([0-9]*),\\\"skuIdStr\\\":\\\"[0-9]*\\\",\\\"skuPropIds\\\":\\\"[0-9,]*\\\",\\\"skuVal\\\":{\\\"availQuantity\\\":([0-9]*),(\\\"bulkOrder\\\":([0-9]*),)?(\\\"discount\\\":\\\"([0-9]*)\\\",\\\"discountTips\\\":\\\"-[0-9]*%\\\",)?\\\"hideOriPrice\\\":(false|true),\\\"inventory\\\":([0-9]*),\\\"isActivity\\\":(true|false),\\\"optionalWarrantyPrice\\\":\[\],(\\\"skuActivityAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)},\\\"skuActivityAmountLocal\\\":\\\"[0-9]*,[0-9]*.\|[0-9]*\|[0-9]*\\\",)?\\\"skuAmount\\\":{\\\"currency\\\":\\\"([A-Z]*)\\\",\\\"formatedAmount\\\":\\\"[0-9]*,[0-9]*.\\\",\\\"value\\\":([0-9]*\.[0-9]*)}') - choice_regex = re.compile(r'businessModel\\\":\\\"CHOICE\\\"') - magnifier_image_regex = re.compile(r'') - punish_regex = re.compile(r'(pid: \'punish-page\')|(Deny from x5)') - session = requests.Session() + ##### to use with selenium firefox container + driver = webdriver.Remote( + command_executor='http://127.0.0.1:4444', + options=webdriver.FirefoxOptions() + ) + + ##### for testing with local geckodriver + # options=Options() + # service = webdriver.FirefoxService(executable_path='/bin/geckodriver') + # profile = webdriver.FirefoxProfile() + # options.profile = profile + # driver = webdriver.Firefox(service=service, options=options) + + # load login cookies cookies_file_path = os.path.dirname(os.path.realpath(__file__))+'/cookies.json' - cookies = load_cookies_from_file(cookies_file_path) - session.cookies.update(cookies) + cookies = load_cookies_from_file_selenium(cookies_file_path) + driver.get("https://aliexpress.com") + sleep(random.uniform(3, 6)) + driver.delete_all_cookies() + for cookie in cookies: + driver.add_cookie(cookie) + + sleep(random.uniform(3, 6)) + + + # accept cookies + try: + WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.CLASS_NAME, "global-gdpr-btn-wrap")) + ) + accept_button = WebDriverWait(driver, 10).until( + EC.element_to_be_clickable((By.CLASS_NAME, "btn-accept")) + ) + sleep(random.uniform(2, 4)) + accept_button.click() + + print("Cookies accepted") + + except Exception as e: + print(f"An error occurred: {e}") + - extract = dict() print(settings_item) item = settings_item[0] filter_attributes = settings_item[1] url = 'https://aliexpress.com/item/'+str(item)+'.html' - target_page_response = session.get(url) - if target_page_response.status_code == 200: - punish = bool(re.search(punish_regex, target_page_response.text)) - if punish: - raise ValueError("punish") + driver.get(url) - content = re.findall(item_regex, target_page_response.text) - is_choice = bool(re.search(choice_regex, target_page_response.text)) - for elem in content: - if set(get_attributes(elem[0])) == set(filter_attributes): - key = (item,tuple(filter_attributes)) - discount = 0 if len(elem[7]) == 0 else int(elem[7]) - price = float(elem[15]) if len(elem[13]) == 0 else float(elem[13]) - - # get item image - image_link = re.findall(magnifier_image_regex, target_page_response.text)[0] - for attr in filter_attributes: - image_regex = re.compile(fr'\"propertyValueDefinitionName\":\"{attr}\",\"propertyValueIdLong\":[0-9]*,\"skuPropertyImageSummPath\":\"(https:\/\/[0-9a-zA-Z.\/\-\_]*)\"') - image = re.findall(image_regex, target_page_response.text) - if len(image)>0: - image_link = image[0] - image_link = re.sub(r'(jpg|png)_[0-9]+x[0-9]+', r'\1_800x800', image_link) # get bigger image instead of preview - break - - # get currency - currency = elem[12] if (len(elem[12])>0) else elem[14] - - # skuId, quantity, discount_percentage, price, currency, choice_delivery, image - extract[key] = {"skuid": elem[2], "quantity": elem[3], "discount_percentage": discount, "price": price, "currency": currency, "choice_delivery": is_choice, "image": image_link} + # check if punish page hit + punish = bool(re.search(punish_regex, driver.page_source)) + if punish: + print("punish page") + driver.quit() + raise ValueError("punish") else: - print(f'Failed to fetch target page. Status code: {target_page_response.status_code}') + # refresh page to have the price in a single span + driver.refresh() + # click on each attribute + for attribute in filter_attributes: + if bool(re.search(" ", attribute)): + possible_attribute = [attribute, re.sub(" ", " ", attribute)] + else: + possible_attribute = [attribute] + for attr in possible_attribute: + try: + # try to find an image with an alt text corresponding to the attribute + img_element = WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.XPATH, f"//img[@alt='{attr}']")) + ) - return extract + location = img_element.location + size = img_element.size + + # click on where the image is (images appear to always be 60x60px) + center_x = (location['x'] + size['width'] / 2) + random.uniform(-10,10) + center_y = (location['y'] + size['height'] / 2) + random.uniform(-10,10) + + sleep(random.uniform(2, 4)) + actions = ActionChains(driver) + actions.move_by_offset(center_x, center_y).click().perform() + + print(f"clicked on {attr}") + break + except Exception as e: + try: + # try to find a div with corresponding text instead + div_element = WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.XPATH, f"//div[@title='{attr}']//span[text()='{attr}']")) + ) + + sleep(random.uniform(2, 4)) + actions = ActionChains(driver) + actions.move_to_element(div_element).click().perform() + + print(f"clicked on {attr}") + break + except Exception as e: + print(f"Div or image {attr} not found: {e}") + + ### scrapping data + # price and currency + try: + div_element = WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'product-price-current')]")) + ) + span_element = div_element.find_element(By.XPATH, ".//span[contains(@class, 'product-price-value')]") + + price_text = span_element.text + price = float(re.sub(",", ".", re.findall(price_regex, price_text)[0])) + currency = re.sub(" ", "", re.sub(price_regex, "", price_text)) + + print(f"The extracted price is: {price}, the extracted currency is: {currency}") + except Exception as e: + print(f"An error occurred: {e}") + + # discount percentage + discount = 0 + try: + div_element = WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'price--original--wEueRiZ')]")) + ) + span_element = div_element.find_element(By.XPATH, ".//span[contains(@class, 'price--discount--Y9uG2LK')]") + + discount_text = span_element.text + discount = re.findall(number_regex, discount_text)[0] + + print(f"The extracted discount is: {discount}") + except Exception as e: + print(f"An error occurred: {e}") + + # quantity + quantity = '0' + try: + div_element = WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'quantity--info--jnoo_pD')]")) + ) + span_element = div_element.find_element(By.XPATH, ".//span") + + quantity_text = span_element.text + quantity = re.findall(number_regex, quantity_text)[0] + + print(f"The extracted quantity is: {quantity}") + except Exception as e: + print(f"An error occurred: {e}") + + + # image link + try: + div_element = WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'magnifier--wrap--cF4cafd')]")) + ) + + img_element = div_element.find_element(By.XPATH, ".//img[contains(@class, 'magnifier--image--EYYoSlr')]") + + img_src = img_element.get_attribute("src") + image_link = re.sub(r'(jpg|png)_[0-9]+x[0-9]+', r'\1_800x800', img_src) # get bigger image instead of preview + + print(f"The extracted image URL is: {image_link}") + except Exception as e: + try: + # if the image in the magnifier wrap doesn't exist it might be a video instead (image is video preview) + div_element = WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'video--wrap--EhkqzuR')]")) + ) + + img_element = div_element.find_element(By.XPATH, ".//video[contains(@class, 'video--video--lsI7y97')]") + + img_src = img_element.get_attribute("poster") + image_link = re.sub(r'(jpg|png)_[0-9]+x[0-9]+', r'\1_800x800', img_src) # get bigger image instead of preview + + print(f"The extracted image URL is: {image_link}") + except Exception as e: + print(f"An error occurred: {e}") + + # choice + choice = False + try: + div_element = WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'choice-mind--box--fJKH05M')]")) + ) + img_element = div_element.find_element(By.XPATH, ".//img") + if img_element.get_attribute("src") == "https://ae01.alicdn.com/kf/S2eef439ead604da69a385490d86118c97/98x42.png": + choice = True + except Exception as e: + print(f"An error occurred: {e}") + print(f"Choice delivery: {choice}") + + + + # return the data scraped + extract = dict() + key = (item,tuple(filter_attributes)) + # skuId, quantity, discount_percentage, price, currency, choice_delivery, image + extract[key] = {"skuid": '0', "quantity": quantity, "discount_percentage": discount, "price": price, "currency": currency, "choice_delivery": choice, "image": image_link} + print(extract) + driver.quit() + return extract def get_attributes(attributes_raw): '''return a list of attributes from attributes raw string''' @@ -124,6 +291,3 @@ def retry_update(retry_list): retry.append(item) return retry - - - diff --git a/src/main.py b/src/main.py index 7acd444..10bbc6a 100644 --- a/src/main.py +++ b/src/main.py @@ -7,7 +7,7 @@ from aliexpress import * if __name__ == '__main__': - # c_l = ["33058732737", ["1.2 M Probe"]] + # c_l = ["33058732737", ["3 M Probe"]] # print(check_item(c_l)) # c_l = ["1005005769229528", ["2 E27 Clip EU"]] # print(check_item(c_l)) @@ -23,17 +23,17 @@ if __name__ == '__main__': # # c_l = ["1005005967514183", ["1KG-White"]] # print(check_item(c_l)) - # - # + # # + # # # c_l = ["1005006062371246", ["Pr Tactile 62g x35"]] # print(check_item(c_l)) - # c_l = ["1005005676358693", ["5x250mm 5pcs"]] - # print(check_item(c_l)) - - c_l = ["1005005824413309", ["00350"]] + c_l = ["1005005676358693", ["5x250mm 5pcs"]] print(check_item(c_l)) + # c_l = ["1005005824413309", ["00350"]] + # print(check_item(c_l)) + # # c_l = ["1005005777900699", ["Black"]] # print(check_item(c_l))