This is a scraper that runs on Morph. To get started see the documentation
To download data sign in with GitHub
rows 10 / 12402
productid | url | domain | price | salesprice | domainmisc | prodlogurls | prodlogurl | finalimgurls | validimgurls | imgurls | notfound | notavailable | removeon404 | soldoutfix | soldouthtmlfix | catstoaddresult | attributes | sizetypemapsqls | filusid | filus |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
21850
|
www.shopalexis.com
|
1
|
|
""
|
""
|
|
""
|
""
|
""
|
true
|
false
|
true
|
false
|
false
|
""
|
[{"name": "Brand", "options": [[{"term_id": 536, "name": "Alexis", "slug": "brand-alexis", "taxonomy": "pa_brand"}, false]], "position": 1, "visible": 1, "variation": 1}, {"name": "Sex", "options": [[{"term_id": 142, "name": "Female", "slug": "female", "taxonomy": "pa_sex"}, false]], "position": 2, "visible": 1, "variation": 1}]
|
["", "", "", ""]
|
|
|
|
21853
|
www.shopalexis.com
|
1
|
|
""
|
""
|
|
""
|
""
|
""
|
true
|
false
|
true
|
false
|
false
|
""
|
[{"name": "Brand", "options": [[{"term_id": 536, "name": "Alexis", "slug": "brand-alexis", "taxonomy": "pa_brand"}, false]], "position": 1, "visible": 1, "variation": 1}, {"name": "Sex", "options": [[{"term_id": 142, "name": "Female", "slug": "female", "taxonomy": "pa_sex"}, false]], "position": 2, "visible": 1, "variation": 1}]
|
["", "", "", ""]
|
|
|
|
21856
|
www.shopalexis.com
|
1
|
|
""
|
""
|
|
""
|
""
|
""
|
true
|
false
|
true
|
false
|
false
|
""
|
[{"name": "Brand", "options": [[{"term_id": 536, "name": "Alexis", "slug": "brand-alexis", "taxonomy": "pa_brand"}, false]], "position": 1, "visible": 1, "variation": 1}, {"name": "Sex", "options": [[{"term_id": 142, "name": "Female", "slug": "female", "taxonomy": "pa_sex"}, false]], "position": 2, "visible": 1, "variation": 1}]
|
["", "", "", ""]
|
|
|
|
21859
|
www.shopalexis.com
|
1
|
|
""
|
""
|
|
""
|
""
|
""
|
true
|
false
|
true
|
false
|
false
|
""
|
[{"name": "Brand", "options": [[{"term_id": 536, "name": "Alexis", "slug": "brand-alexis", "taxonomy": "pa_brand"}, false]], "position": 1, "visible": 1, "variation": 1}, {"name": "Sex", "options": [[{"term_id": 142, "name": "Female", "slug": "female", "taxonomy": "pa_sex"}, false]], "position": 2, "visible": 1, "variation": 1}]
|
["", "", "", ""]
|
|
|
|
21862
|
www.shopalexis.com
|
1
|
|
""
|
""
|
|
""
|
""
|
""
|
true
|
false
|
true
|
false
|
false
|
""
|
[{"name": "Brand", "options": [[{"term_id": 536, "name": "Alexis", "slug": "brand-alexis", "taxonomy": "pa_brand"}, false]], "position": 1, "visible": 1, "variation": 1}, {"name": "Sex", "options": [[{"term_id": 142, "name": "Female", "slug": "female", "taxonomy": "pa_sex"}, false]], "position": 2, "visible": 1, "variation": 1}]
|
["", "", "", ""]
|
|
|
|
21865
|
www.shopalexis.com
|
1
|
|
""
|
""
|
|
""
|
""
|
""
|
true
|
false
|
true
|
false
|
false
|
""
|
[{"name": "Brand", "options": [[{"term_id": 536, "name": "Alexis", "slug": "brand-alexis", "taxonomy": "pa_brand"}, false]], "position": 1, "visible": 1, "variation": 1}, {"name": "Sex", "options": [[{"term_id": 142, "name": "Female", "slug": "female", "taxonomy": "pa_sex"}, false]], "position": 2, "visible": 1, "variation": 1}]
|
["", "", "", ""]
|
|
|
|
21868
|
www.shopalexis.com
|
1
|
|
""
|
""
|
|
""
|
""
|
""
|
true
|
false
|
true
|
false
|
false
|
""
|
[{"name": "Brand", "options": [[{"term_id": 536, "name": "Alexis", "slug": "brand-alexis", "taxonomy": "pa_brand"}, false]], "position": 1, "visible": 1, "variation": 1}, {"name": "Sex", "options": [[{"term_id": 142, "name": "Female", "slug": "female", "taxonomy": "pa_sex"}, false]], "position": 2, "visible": 1, "variation": 1}]
|
["", "", "", ""]
|
|
|
|
21871
|
www.shopalexis.com
|
1
|
|
""
|
""
|
|
""
|
""
|
""
|
true
|
false
|
true
|
false
|
false
|
""
|
[{"name": "Brand", "options": [[{"term_id": 536, "name": "Alexis", "slug": "brand-alexis", "taxonomy": "pa_brand"}, false]], "position": 1, "visible": 1, "variation": 1}, {"name": "Sex", "options": [[{"term_id": 142, "name": "Female", "slug": "female", "taxonomy": "pa_sex"}, false]], "position": 2, "visible": 1, "variation": 1}]
|
["", "", "", ""]
|
|
|
|
21874
|
www.shopalexis.com
|
1
|
|
""
|
""
|
|
""
|
""
|
""
|
true
|
false
|
true
|
false
|
false
|
""
|
[{"name": "Brand", "options": [[{"term_id": 536, "name": "Alexis", "slug": "brand-alexis", "taxonomy": "pa_brand"}, false]], "position": 1, "visible": 1, "variation": 1}, {"name": "Sex", "options": [[{"term_id": 142, "name": "Female", "slug": "female", "taxonomy": "pa_sex"}, false]], "position": 2, "visible": 1, "variation": 1}]
|
["", "", "", ""]
|
|
|
|
21877
|
www.shopalexis.com
|
1
|
|
""
|
""
|
|
""
|
""
|
""
|
true
|
false
|
true
|
false
|
false
|
""
|
[{"name": "Brand", "options": [[{"term_id": 536, "name": "Alexis", "slug": "brand-alexis", "taxonomy": "pa_brand"}, false]], "position": 1, "visible": 1, "variation": 1}, {"name": "Sex", "options": [[{"term_id": 142, "name": "Female", "slug": "female", "taxonomy": "pa_sex"}, false]], "position": 2, "visible": 1, "variation": 1}]
|
["", "", "", ""]
|
|
|
To download data sign in with GitHub
rows 1 / 1
file_id | file_cont |
---|---|
1
|
["#!/usr/bin/env python\n", "# -*- coding: utf-8 -*- \n", "\n", "# /|\\/|\\/|\\/|\\/|\\/|\\/|\\/|\\/|\\/|\\/|\\/|\\/|\\/|\\/|\\ \n", "# < - Brandhunt Product Update Scraper - >\n", "# \\|/\\|/\\|/\\|/\\|/\\|/\\|/\\|/\\|/\\|/\\|/\\|/\\|/\\|/\\|/\n", "\n", "# --- IMPORT SECTION --- #\n", "\n", "import os\n", "os.environ['SCRAPERWIKI_DATABASE_NAME'] = 'sqlite:///data.sqlite'\n", "\n", "import scraperwiki\n", "from lxml import etree\n", "import lxml.html\n", "import requests\n", "import json\n", "import base64\n", "#import mysql.connector\n", "import re\n", "###import random\n", "from selenium import webdriver\n", "###from seleniumwire import webdriver\n", "#from selenium.webdriver.common.keys import Keys\n", "from selenium.common.exceptions import TimeoutException\n", "from selenium.common.exceptions import WebDriverException\n", "from selenium.webdriver.chrome.options import Options\n", "from selenium.webdriver.support.ui import WebDriverWait\n", "from slugify import slugify\n", "from splinter import Browser\n", "import sys\n", "import time\n", "import traceback\n", "from translate import Translator\n", "#from urllib2 import HTTPError\n", "from urllib.error import HTTPError\n", "try:\n", " from urllib.parse import urljoin\n", "except ImportError:\n", " from urlparse import urljoin\n", "\n", "# --- FUNCTION SECTION --- #\n", "\n", "# *** --- Replacement for PHP's array merge functionality --- *** #\n", "def array_merge(array1, array2):\n", " if isinstance(array1, list) and isinstance(array2, list):\n", " return array1 + array2\n", " elif isinstance(array1, dict) and isinstance(array2, dict):\n", " return dict(list(array1.items()) + list(array2.items()))\n", " elif isinstance(array1, set) and isinstance(array2, set):\n", " return array1.union(array2)\n", " return False\n", "\n", "# *** --- For checking if a certain product attribute exists --- *** #\n", "def doesprodattrexist(prodattrlist, term, taxonomy):\n", " for prodattr in prodattrlist:\n", " if prodattr['term_id'] == term or prodattr['name'] == term or prodattr['slug'] == term:\n", " return prodattr\n", " return 0\n", " \n", "# *** --- Custom substitute for adding together attributes variables --- *** #\n", "def add_together_attrs(attrlist1, attrlist2, prodattr, jsonprodattr):\n", " newattrs=list((a for a in attrlist1 if a[0]['term_id'] == -1))\n", " oldattrs=list((a[0]['term_id'] for a in attrlist1 if a[0]['term_id'] > -1))\n", " attrlist2=list((a[0]['term_id'] for a in attrlist2))\n", " #print('newattrs: ' + json.dumps(list(newattrs)))\n", " #print('oldattrs: ' + json.dumps(list(oldattrs)))\n", " #filtattrs = oldattrs + attrlist2\n", " filtattrs = list(set(oldattrs) | set(attrlist2)) \n", " #print('filtattrs: ' + json.dumps(list(filtattrs)))\n", " for flt in filtattrs:\n", " flt = doesprodattrexist(jsonprodattr[prodattr], flt, prodattr)\n", " if flt != 0:\n", " newattrs.append((flt, False))\n", " #print('finalattr: ' + json.dumps(list(finalattr)))\n", " return newattrs\n", " \n", "# *** --- For getting proper value from scraped HTML elements --- *** #\n", "def getmoneyfromtext(price):\n", " val = re.sub(r'\\.(?=.*\\.)', '', price.replace(',', '.'))\n", " if not val: return val\n", " #if not re.match('[0-9]', val): return ''\n", " else: return '{:.0f}'.format(float(re.sub(r'[^0-9,.]', '', val)))\n", " \n", "# *** --- For converting scraped price to correct value according to wanted currency --- *** #\n", "def converttocorrectprice(price, currencysymbol, headers):\n", " r = requests.get('https://api.exchangeratesapi.io/latest?base=' + currencysymbol + '', headers=headers)\n", " json = r.json()\n", " jsonrates = json['rates']\n", " foundinrates = False\n", " for ratekey, ratevalue in jsonrates.items():\n", " if price.find('' + ratekey + '') != -1:\n", " price = price.replace(r'[^0-9,.]', '')\n", " price = getmoneyfromtext(price)\n", " #print('CURRENCY: ' + currencysymbol)\n", " #print('PRICE: ' + price)\n", " #print('RATEKEY: ' + ratekey)\n", " #print('RATEVALUE: ' + str(ratevalue))\n", " price = float(price) / ratevalue\n", " price = getmoneyfromtext(str(price))\n", " foundinrates = True\n", " break\n", " if not foundinrates:\n", " if price.find(u'$') != -1:\n", " price = price.replace(r'[^0-9,.]', '')\n", " price = getmoneyfromtext(price)\n", " price = float(price) / jsonrates['USD']\n", " price = getmoneyfromtext(str(price))\n", " elif price.find(u'\u00a3') != -1:\n", " price = price.replace(r'[^0-9,.]', '')\n", " price = getmoneyfromtext(price)\n", " price = float(price) / jsonrates['GBP']\n", " price = getmoneyfromtext(str(price))\n", " elif price.find(u'\u20ac') != -1:\n", " price = price.replace(r'[^0-9,.]', '')\n", " price = getmoneyfromtext(price)\n", " price = float(price) / jsonrates['EUR']\n", " price = getmoneyfromtext(str(price))\n", " else:\n", " price = price.replace(r'[^0-9,.]', '')\n", " price = getmoneyfromtext(price)\n", " #print(\"CONVERTEDPRICE:\" + price)\n", " return price\n", "\n", "# *** --- For grabbing URLs from text-based values/strings --- *** #\n", "def graburls(text, imageonly):\n", " try:\n", " imgsuffix = ''\n", " if imageonly:\n", " imgsuffix = '\\.(gif|jpg|jpeg|png|svg|webp)'\n", " else:\n", " imgsuffix = '\\.([a-zA-Z0-9\\&\\.\\/\\?\\:@\\-_=#])*'\n", " finalmatches = []\n", " # --> For URLs without URL encoding characters:\n", " matches = re.finditer(r'((http|https)\\:\\/\\/)?[a-zA-Z0-9\\.\\/\\?\\:\\~@\\-_=#]+' + imgsuffix + '', text)\n", " for match in matches:\n", " finalmatches.append(match.group())\n", " #print('URLNOENCODEMATCHES:')\n", " #for match in matches: print(match)\n", " # --> For URLs - with - URL encoding characters:\n", " matches = re.finditer(r'((http|https)\\:\\/\\/)?[a-zA-Z0-9\\.\\/\\?\\\\%:\\~@\\-_=#]+' + imgsuffix + '', text)\n", " for match in matches:\n", " finalmatches.append(match.group())\n", " #print('URLNOENCODEMATCHES:')\n", " #for match in matches: print(match)\n", " #print('FINALMATCHES')\n", " #for match in finalmatches: print(match)\n", " finalmatches = list(set(finalmatches))\n", " return { i : finalmatches[i] for i in range(0, len(finalmatches)) }\n", " except:\n", " print('Error grabbing urls!')\n", " return []\n", " \n", "# *** --- For converting relative URLs to absolute URLs --- *** #\n", "def reltoabs(relurl, baseurl):\n", " pass\n", "\n", "# --> Decode and handle these URLs!\n", "\n", "def mainfunc(maxlimit):\n", " \n", " # --> First, check if the database should be reset:\n", "\n", " #if bool(os.environ['MORPH_RESET_DB']):\n", " # if scraperwiki.sql.select('* from data'):\n", " # scraperwiki.sql.execute('DELETE FROM data')\n", "\n", " #from pathlib import Path\n", " #print(\"File Path:\", Path(__file__).absolute())\n", " #print(\"Directory Path:\", Path().absolute())\n", "\n", " #import os\n", " #os.chmod('/usr/local/bin/chromedriver', 755)\n", "\n", " #optionuls = webdriver.ChromeOptions()\n", " #optionuls.add_argument('--headless')\n", " #optionuls.add_argument('--disable-dev-shm-usage')\n", " #optionuls.add_argument('--no-sandbox')\n", " #browsur = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver',options=optionuls, service_args=[\"--verbose\"])\n", " #browsur.set_window_size(1920, 1080)\n", " #browsur.get('https://www.nonspecificwebsite.com')\n", "\n", " # --> Connect to Wordpress Site via REST API and get all the proper URLs to be scraped!\n", "\n", " wp_username = os.environ['MORPH_WP_USERNAME']\n", " wp_password = os.environ['MORPH_WP_PASSWORD']\n", " wp_connectwp_url = os.environ['MORPH_WP_CONNECT_URL']\n", " wp_connectwp_url_2 = os.environ['MORPH_WP_CONNECT_URL_2']\n", " wp_connectwp_url_3 = os.environ['MORPH_WP_CONNECT_URL_3']\n", " wp_connectwp_url_4 = os.environ['MORPH_WP_CONNECT_URL_4']\n", " wp_connectwp_url_5 = os.environ['MORPH_WP_CONNECT_URL_5']\n", " wp_connectwp_url_6 = os.environ['MORPH_WP_CONNECT_URL_6']\n", " wp_connectwp_url_7 = os.environ['MORPH_WP_CONNECT_URL_7']\n", "\n", " encodestring = wp_username + ':' + wp_password;\n", " #token = base64.standard_b64encode(wp_username + ':' + wp_password)\n", " token = base64.b64encode(encodestring.encode())\n", " headers = {'Authorization': 'Basic ' + token.decode('ascii')}\n", "\n", " amount_processed = 0\n", " offset = int(os.environ['MORPH_START_OFFSET'])\n", " limit = 25\n", "\n", " #r = requests.get(wp_connectwp_url, headers=headers)\n", " r = requests.get(wp_connectwp_url + str(offset) + '/' + str(limit) + '/', headers=headers)\n", " #jsonprods = r.json()\n", " jsonprods = json.loads(r.content)\n", "\n", " r = requests.get(wp_connectwp_url_2, headers=headers)\n", " jsonwebsites = json.loads(r.content)\n", "\n", " r = requests.get(wp_connectwp_url_3, headers=headers)\n", " jsonprodattr = json.loads(r.content)\n", "\n", " r = requests.get(wp_connectwp_url_4, headers=headers)\n", " jsoncatsizetypemaps = json.loads(r.content)\n", "\n", " r = requests.get(wp_connectwp_url_5, headers=headers)\n", " jsoncatmaps = json.loads(r.content)\n", "\n", " r = requests.get(wp_connectwp_url_6, headers=headers)\n", " jsonsizemaps = json.loads(r.content)\n", " \n", " r = requests.get(wp_connectwp_url_7, headers=headers)\n", " jsonprodfixes = json.loads(r.content)\n", "\n", " # --> Get the proxy information and related modules!\n", "\n", " ###wonpr_token = os.environ['MORPH_WONPR_API_TOKEN']\n", " ###wonpr_url = os.environ['MORPH_WONPR_CONNECT_URL']\n", " ###wonpr_secret_key = os.environ['MORPH_WONPR_SECRET_KEY']\n", " ###wonpr_user = os.environ['MORPH_WONPR_USERNAME']\n", " ###wonpr_pass = os.environ['MORPH_WONPR_PASSWORD']\n", " ###\n", " ###encodestring2 = wonpr_token + ':'\n", " ###token2 = base64.b64encode(encodestring2.encode())\n", " ###wonpr_headers = {'Authorization': 'Basic ' + token2.decode('ascii')}\n", " ###\n", " ###r = requests.get(wonpr_url, headers=wonpr_headers)\n", " ###jsonproxies = json.loads(r.content)\n", " ###finalproxies = []\n", "\n", " #print(jsonproxies)\n", "\n", " ###for proxy in jsonproxies:\n", " ### if proxy['server'] == 'stockholm' or proxy['server'] == 'gothenburg':\n", " ### for ip in proxy['ips']:\n", " ### if ip['status'] == 'ok':\n", " ### finalproxies.append(proxy['hostname'] + ':1100' + str(ip['port_base']))\n", " ### break\n", " ### \n", " ###proxies = []\n", " ###if finalproxies:\n", " ### randomproxy = random.choice(finalproxies)\n", " ### proxies = {'http': 'http://' + wonpr_user + ':' + wonpr_pass + '@' + randomproxy,\n", " ### 'https': 'https://' + wonpr_user + ':' + wonpr_pass + '@' + randomproxy,\n", " ### 'no_proxy': 'localhost,127.0.0.1'}\n", " \n", " #arraus = []\n", " totalscrapedcount = 0\n", " while jsonprods:\n", " for website in jsonwebsites:\n", " # Should we ignore the current website? #\n", " if website['ignorethisone'] == '1':\n", " continue\n", " # Check if there are any initial values to take care of! #\n", " override_timeout = ''\n", " altimggrab = ''\n", " skip_from_img_url = ''\n", " translate_pa_category_html = '' \n", " orig_prodmisc = ''\n", " if website['productmisc'] != '':\n", " orig_prodmisc = website['productmisc']\n", " intro_output = re.search(r'({translate_pa_category_html}(.*?))\\{', website['productmisc'])\n", " if intro_output is not None and len(intro_output.group(1)) > 0:\n", " translate_pa_category_html = intro_output.group(2)\n", " website['productmisc'] = re.sub(r'({translate_pa_category_html}.*?(?=\\{))', '', website['productmisc'])\n", " intro_output = re.search(r'({override_timeout}(.*?))\\{', website['productmisc'])\n", " if intro_output is not None and len(intro_output.group(1)) > 0:\n", " override_timeout = intro_output.group(2)\n", " website['productmisc'] = re.sub(r'({override_timeout}.*?(?=\\{))', '', website['productmisc'])\n", " intro_output = re.search(r'({alt_img_grab}(.*?))\\{', website['productmisc'])\n", " if intro_output is not None and len(intro_output.group(1)) > 0:\n", " altimggrab = '1'\n", " website['productmisc'] = re.sub(r'({alt_img_grab}.*?(?=\\{))', '', website['productmisc'])\n", " intro_output = re.search(r'({alt_img_grab_2}(.*?))\\{', website['productmisc'])\n", " if intro_output is not None and len(intro_output.group(1)) > 0:\n", " altimggrab = '2'\n", " website['productmisc'] = re.sub(r'({alt_img_grab_2}.*?(?=\\{))', '', website['productmisc'])\n", " intro_output = re.search(r'({skip_from_img_url}(.*?))\\{', website['productmisc'])\n", " if intro_output is not None and len(intro_output.group(1)) > 0:\n", " skip_from_img_url = intro_output.group(2)\n", " website['productmisc'] = re.sub(r'({skip_from_img_url}.*?(?=\\{))', '', website['productmisc'])\n", " # Check each product - See if any of them belong to the current website! #\n", " for product in jsonprods:\n", " if website['domain'] == product['domain']:\n", " # --- First, get the HTML for each domain part --- #\n", " if website['scrapetype'] == 'phantomjs_morph_io':\n", " try:\n", " #with Browser(\"phantomjs\") as browser:\n", " ##chrome_options = Options()\n", " #chrome_options.add_argument(\"--headless\")\n", " #chrome_options.add_argument(\"--disable-gpu\")\n", " ##chrome_options.add_argument('--no-sandbox')\n", " ##chrome_options.add_argument('--disable-extensions')\n", " #chrome_options.binary_location = '/usr/bin/google-chrome'\n", " #browser = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver', chrome_options=chrome_options)\n", " #with Browser('chrome', headless=True, options=chrome_options) as browser:\n", " optionals = Options()\n", " #optionals.add_argument('--no-proxy-server')\n", " #optionals.add_argument(\"--proxy-server='direct://'\")\n", " #optionals.add_argument(\"--proxy-bypass-list=*\")\n", " optionals.add_argument('--disable-dev-shm-usage')\n", " optionals.add_argument('--disable-extensions')\n", " optionals.add_argument('--no-sandbox')\n", " optionals.add_argument(\"--headless\")\n", " ###optionals.add_argument(\"--proxy-server=\" + proxies['https'] + \"\")\n", " #optionals.add_argument('--lang=en_US') \n", " #optionals.add_argument('--lang=sv')\n", " optionals.add_experimental_option('prefs', {'intl.accept_languages': 'sv',\n", " 'profile.default_content_setting_values.geolocation': 1,\n", " \"profile.default_content_settings.geolocation\": 1})\n", " ###optionals_wire = { 'proxy': proxies }\n", " #optionals.add_argument('--disable-gpu')\n", " #optionals.add_argument('--ignore-certificate-errors')\n", " #optionals.add_argument(\"--start-maximized\") \n", " #optionals.add_argument(\"disable-infobars\") \n", " html_source = ''\n", " root = ''\n", " #with Browser('chrome', headless=True, options=optionals, service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any']) as browser:\n", " #with Browser('chrome', headless=True, options=optionals) as browser:\n", " ###with webdriver.Chrome(options=optionals, seleniumwire_options=optionals_wire) as driver:\n", " with webdriver.Chrome(options=optionals) as driver:\n", " #browser.driver.close()\n", " #browser.driver = webdriver.Chrome(options=optionals, seleniumwire_options=optionals_wire)\n", " #browser = webdriver.Chrome(options=optionals, service_args=[\"--verbose\"])\n", " driver.set_page_load_timeout(300)\n", " driver.set_window_size(1920, 1080)\n", " params = {\"latitude\": 59.3521,\n", " \"longitude\": 18.0041,\n", " \"accuracy\": 100}\n", " response = driver.execute_cdp_cmd(\"Page.setGeolocationOverride\", params)\n", " # submit the search form...\n", " ##browser.fill(\"q\", \"parliament\")\n", " ##button = browser.find_by_css(\"button[type='submit']\")\n", " ##button.click()\n", " # Scrape the data you like...\n", " ##links = browser.find_by_css(\".search-results .list-group-item\")\n", " ##for link in links:\n", " ## print link['href']\n", " # >>> VISIT THE PAGE THROUGH BROWSER <<< #\n", " try:\n", " #browser.driver.implicitly_wait(30) # seconds\n", " driver.get(product['url'])\n", " #myDynamicElement = browser.driver.find_element_by_id(\"attribute135\")\n", " #time.sleep(25)\n", " if override_timeout != '':\n", " time.sleep(int(override_timeout))\n", " else:\n", " time.sleep(2)\n", " #browser.driver.refresh()\n", " html_source = driver.page_source\n", " #html_source = browser.driver.page_source\n", " #html_source = browser.driver.execute_script('return document.documentElement.outerHTML')\n", " driver.quit()\n", " #browser.get(product['url'])\n", " #print(\"HTML:\")\n", " #print(html_source)\n", " #print(browser.driver.page_source)\n", " except HTTPError as err:\n", " if err.code == 302:\n", " try:\n", " url_headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',\\\n", " 'User-Agent':'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',\\\n", " 'Accept-Encoding':'gzip, deflate',\\\n", " 'Accept-Language':'en-US,en;q=0.8'}\n", " url_session = requests.session()\n", " response = url_session.get(url=product['url'], headers=url_headers)\n", " html = response.content\n", " except:\n", " print(traceback.format_exc())\n", " elif err.code == 404:\n", " notfound = True\n", " removeon404 = False\n", " if website['productmisc']:\n", " if website['productmisc'].find('allow_remove_on_404'):\n", " removeon404 = True\n", " try:\n", " scraperwiki.sqlite.save(unique_keys=['productid'],\\\n", " data={'productid': product['productid'],\\\n", " 'url': product['url'],\\\n", " 'domain': product['domain'],\\\n", " 'price': '',\\\n", " 'salesprice': '',\\\n", " 'domainmisc': '',\\\n", " 'prodlogurls': '',\\\n", " 'prodlogurl': '',\\\n", " 'finalimgurls': '',\\\n", " 'validimgurls': '',\\\n", " 'imgurls': '',\\\n", " 'notfound': notfound,\\\n", " 'notavailable': True,\\\n", " 'removeon404': removeon404,\\\n", " 'soldoutfix': '',\\\n", " 'soldouthtmlfix': '',\\\n", " 'catstoaddresult': '',\\\n", " 'attributes': '',\\\n", " 'sizetypemapsqls': ''})\n", " totalscrapedcount = totalscrapedcount + 1\n", " continue\n", " except:\n", " print(traceback.format_exc())\n", " continue\n", " else:\n", " raise\n", " except WebDriverException:\n", " print('Chrome not reachable - The product will be rescraped again!')\n", " jsonprods.append(product)\n", " time.sleep(2)\n", " continue\n", " except:\n", " #print(\"Error when scraping URL for product ID \" + product['productid'] + \": \" + str(sys.exc_info()[0]) + \" occured!\")\n", " print(traceback.format_exc())\n", " print(\"Currently scraping product with ID \" + str(product['productid']))\n", " # >>> GET THE HTML ROOT <<< #\n", " root = lxml.html.fromstring(html_source)\n", " # >>> GET THE PRICE <<< #\n", " price_elements = ''\n", " price = ''\n", " #print(website['priceselector'])\n", " try:\n", " website['priceselector'] = website['priceselector'].encode().decode(\"unicode-escape\")\n", " #print(website['priceselector'])\n", " if website['priceselector'].find('[multiple],') != -1:\n", " website['priceselector'].replace('[multiple],', '')\n", " #price_elements = browser.find_by_css(website['priceselector'])\n", " price_elements = root.cssselect(website['priceselector'])\n", " for el in price_elements:\n", " if el is None:\n", " continue\n", " price = price + el.text + ' '\n", " if price != '':\n", " price = re.sub(r'([^a-zA-Z]\\w+\\%+)', '', price)\n", " else:\n", " #price_elements = browser.find_by_css(website['priceselector'])\n", " price_elements = root.cssselect(website['priceselector'])\n", " if price_elements:\n", " for price_el in price_elements:\n", " if price_el.text is not None:\n", " if any(char.isdigit() for char in price_el.text):\n", " price = price_el.text\n", " price = re.sub(r'([^a-zA-Z]\\w+\\%+)', '', price)\n", " break\n", " else:\n", " price = '-1'\n", " else:\n", " price = '-1'\n", " if website['pricedelimitertoignore']:\n", " if website['pricedelimitertoignore'].strip().find(' ') != -1:\n", " sepdelimiters = website['pricedelimitertoignore'].strip().split(' ')\n", " for delim in sepdelimiters:\n", " price = re.sub('\\\\' + delim.strip() + '', '', price)\n", " else:\n", " price = re.sub('\\\\' + website['pricedelimitertoignore'].strip() + '', '', price) \n", " if website['currencysymbol']:\n", " #print('PRICEBEFORECONVERSION:' + price)\n", " #print('PRICE ELEMENTS:')\n", " #for p in price_elements: print p\n", " #if not re.match('[0-9]', price): price = ''\n", " price = converttocorrectprice(price, website['currencysymbol'], headers)\n", " else:\n", " price = price.replace(r'[^0-9,.]', '')\n", " price = getmoneyfromtext(price)\n", " #print('FINALPRICE:' + price)\n", " except:\n", " #print(\"Error when scraping price for product ID \" + product['productid'] + \": \" + sys.exc_info()[0] + \" occured!\")\n", " print(traceback.format_exc())\n", " # >>> GET THE SALES PRICE <<< #\n", " salesprice_elements = ''\n", " salesprice = ''\n", " if website['salespriceselector']:\n", " try:\n", " website['salespriceselector'] = website['salespriceselector'].encode().decode(\"unicode-escape\")\n", " #salesprice_elements = browser.find_by_css(website['salespriceselector'])\n", " salesprice_elements = root.cssselect(website['salespriceselector'])\n", " if salesprice_elements:\n", " if any(char.isdigit() for char in salesprice_elements[0].text):\n", " salesprice = salesprice_elements[0].text\n", " salesprice = re.sub(r'([^a-zA-Z]\\w+\\%+)', '', salesprice)\n", " else:\n", " salesprice = '-1'\n", " else:\n", " salesprice = '-1'\n", " if website['pricedelimitertoignore']:\n", " if website['pricedelimitertoignore'].strip().find(' ') != -1:\n", " sepdelimiters = website['pricedelimitertoignore'].strip().split(' ')\n", " for delim in sepdelimiters:\n", " salesprice = re.sub('\\\\' + delim.strip() + '', '', salesprice)\n", " else:\n", " salesprice = re.sub('\\\\' + website['pricedelimitertoignore'].strip() + '', '', salesprice) \n", "\n", " if website['currencysymbol']:\n", " #if not re.match('[0-9]', salesprice): salesprice = ''\n", " salesprice = converttocorrectprice(salesprice, website['currencysymbol'], headers)\n", " else:\n", " salesprice = salesprice.replace(r'[^0-9,.]', '')\n", " salesprice = getmoneyfromtext(salesprice)\n", " #print('FINALSALESPRICE:' + salesprice)\n", " except:\n", " #print(\"Error when scraping sales price for product ID \" + product['productid'] + \": \" + sys.exc_info()[0] + \" occured!\")\n", " print(traceback.format_exc())\n", " # >>> GET THE DOMAIN MISC. ELEMENTS <<< #\n", " domainmisc_array = ''\n", " if website['domainmisc']:\n", " try:\n", " domainmisc_array = re.split('{|}', website['domainmisc'])\n", " for i in range(1, len(domainmisc_array), 2):\n", " domainmisc_array[i] = root.cssselect(domainmisc_array[i])\n", " #domainmisc_array[i] = browser.find_by_css(domainmisc_array[i])\n", " #print('DOMAINMISC:')\n", " #for d in domainmisc_array: print d\n", " except:\n", " #print(\"Error when scraping misc. domain information for product ID \" + product['productid'] + \": \" + sys.exc_info()[0] + \" occured!\")\n", " print(traceback.format_exc())\n", " # >>> GET THE PRODUCT LOGO URL(S) - IF SUCH EXISTS <<< #\n", " #prodlog_image_urls = ''\n", " #prodlog_image_elements = ''\n", " prodlog_image_urls = ''\n", " productlogourl = ''\n", " #productlogo = ''\n", " if website['productlogoselector']:\n", " try:\n", " website['productlogoselector'] = website['productlogoselector'].encode().decode(\"unicode-escape\")\n", " #prodlog_image_elements = browser.find_by_css(website['productlogoselector'])\n", " prodlog_image_elements = root.cssselect(website['productlogoselector'])\n", " if prodlog_image_elements:\n", " for i in range(len(prodlog_image_elements)):\n", " #print('HUPP: ' + prodlog_image_elements[i])\n", " prodlog_image_elements[i] = str(etree.tostring(prodlog_image_elements[i]))\n", " #prodlog_image_elements[i] = prodlog_image_elements[i]._element.get_attribute('outerHTML')\n", " image_dom = ','.join(prodlog_image_elements)\n", " #print('IMAGEDOM: ' + image_dom)\n", " if altimggrab == '1':\n", " output = re.search(r'image\\=\\\"(.*?)\\\"', image_dom)\n", " if len(output.group(1)) > 0:\n", " prodlog_image_urls = { 0 : output.group(1) }\n", " elif altimggrab == '2':\n", " output = re.search(r'src\\=\\\"(.*?)\\\"', image_dom)\n", " if len(output.group(1)) > 0:\n", " prodlog_image_urls = { 0 : output.group(1) }\n", " else:\n", " prodlog_image_urls = graburls(str(image_dom), True)\n", " if len(prodlog_image_urls) > 0:\n", " for imagekey, imageval in prodlog_image_urls.copy().items():\n", " #print('OLD: ' + imageval)\n", " newimageval = urljoin(product['url'], imageval)\n", " #print('NEW: ' + newimageval)\n", " if imageval != newimageval:\n", " prodlog_image_urls[imagekey] = newimageval\n", " imageval = newimageval\n", " if imageval.find('//') == -1:\n", " del prodlog_image_urls[imagekey]\n", " continue\n", " if imageval[0:2] == '//':\n", " imageval = 'https:' + imageval\n", " prodlog_image_urls[imagekey] = imageval\n", " #print('HEPP')\n", " #for prod in prodlog_image_urls: print(prod)\n", " productlogourl = prodlog_image_urls[0] \n", " else:\n", " print(\"No product logo URLs could be found for product ID \" + product['productid'] + \"!\")\n", " #print('PRODUCTLOGOS:')\n", " #for p in prodlog_image_urls: print(p)\n", " #print('PRODUCTLOGOURL:' + productlogourl)\n", " except:\n", " #print(\"Error when scraping product logo images for product ID \" + product['productid'] + \": \" + sys.exc_info()[0] + \" occured!\")\n", " print(traceback.format_exc())\n", " # >>> GET THE IMAGE URL(S) <<< #\n", " image_urls = ''\n", " image_elements = ''\n", " image_urls_valid = ''\n", " images = ''\n", " if website['imageselector'] and len(website['imageselector']):\n", " try:\n", " website['imageselector'] = website['imageselector'].encode().decode(\"unicode-escape\")\n", " #image_urls = ''\n", " #image_elements = browser.find_by_css(website['imageselector'])\n", " image_elements = root.cssselect(website['imageselector'])\n", " if image_elements:\n", " for i in range(len(image_elements)):\n", " image_elements[i] = str(etree.tostring(image_elements[i]))\n", " #image_elements[i] = image_elements[i]._element.get_attribute('outerHTML')\n", " image_dom = ','.join(image_elements)\n", " #print('IMAGE DOM: ' + image_dom)\n", " if altimggrab == '1':\n", " output = re.finditer(r'image\\=\\\"(.*?)\\\"', image_dom)\n", " array_output = []\n", " for output_el in output:\n", " array_output.append(output_el.group(1))\n", " if len(array_output) > 0:\n", " image_urls = { i : array_output[i] for i in range(0, len(array_output)) }\n", " elif altimggrab == '2':\n", " output = re.search(r'src\\=\\\"(.*?)\\\"', image_dom)\n", " if len(output.group(1)) > 0:\n", " image_urls = { 0 : output.group(1) }\n", " else:\n", " image_urls = graburls(str(image_dom), True)\n", " #print('PRE-IMAGE URLS: ')\n", " #for img in image_urls: print(img)\n", " if len(image_urls) > 0:\n", " for imagekey, imageval in image_urls.copy().items():\n", " newimageval = urljoin(product['url'], imageval)\n", " if imageval != newimageval:\n", " image_urls[imagekey] = newimageval\n", " imageval = newimageval\n", " if imageval.find('//') == -1 or imageval.find('blank.') != -1:\n", " del image_urls[imagekey]\n", " continue\n", " if imageval[0:2] == '//':\n", " imageval = 'https:' + imageval\n", " image_urls[imagekey] = imageval\n", " if skip_from_img_url != '':\n", " #print('TO SKIP:')\n", " #print(':::' + skip_from_img_url + ':::')\n", " #print(image_urls[imagekey])\n", " image_urls[imagekey] = image_urls[imagekey].replace(skip_from_img_url, '')\n", " #print(image_urls[imagekey])\n", " image_urls_valid = sorted(list(image_urls.values()))\n", " #print('IMAGE ELEMENTS:')\n", " #for img in image_elements: print img\n", " #print('IMAGE URLS:')\n", " #for img in image_urls: print img\n", " #print('VALID IMAGES:')\n", " #for img in image_urls_valid: print img\n", " except:\n", " #print(\"Error when scraping images for product ID \" + product['productid'] + \": \" + sys.exc_info()[0] + \" occured!\")\n", " print(traceback.format_exc())\n", " # >>> GET THE PRODUCT MISC. ELEMENTS <<< #\n", " productmisc_array = re.split('{|}', website['productmisc'])\n", " #print('PRODUCTMISCARRAY BEFORE:')\n", " #for p in productmisc_array: print p\n", " # --> Define containers for product attributes\n", " product_brand = ''\n", " product_colors = ''\n", " product_sex = ''\n", " product_sizes = ''\n", " product_sizetypes = ''\n", " product_sizetypemiscs = ''\n", " product_categories = ''\n", " # --> Define values that will be saved to database once done:\n", " sizetypemisc = ''\n", " preexistingcurrency = ''\n", " notfound = False\n", " notavailable = False\n", " skipfinalsave = False\n", " shouldremoveonnotfound = False\n", " soldoutupdatemeta = False\n", " soldouthtmlupdatemeta = False\n", " catstoaddresult = ''\n", " attributes_to_store = ''\n", " insert_sizetosizetype = ''\n", " remove_sizetosizetype = ''\n", " insert_sizetosizetypemisc = ''\n", " remove_sizetosizetypemisc = ''\n", " skip_exist_attr = [0, 0, 0, 0, 0, 0, 0] # <==> [brand, color, sex, size, s-type, s-t-misc, categories]\n", " skip_exist_attr_prodtitle = [0, 0, 0, 0] # <==> [brand, color, sex, categories]\n", " size_handling_options = [[0, '', '']] # <==> 0 = round up; 1 = round down; 2 = round as division up;\n", " # <==> CONT. 3 = round as division down; 4 = round uneven up; 5 = round uneven down;\n", " # <==> CONT. When seperating sizes by char: 6 = keep all sizes after split; 7 = keep first size; 8 = keep second size;\n", " # <==> CONT. ::: After the ';' character, type the name of the sizetype you wish to handle the sizes for.\n", " # <==> CONT. The last field is only used if you wish to seperate the sizes by a specific character!\n", " # <==> IMPORTANT ::: Type 'ALL' as sizetype if you wish for the first setting to be applied to all sizetypes!\n", " mandatory_sizes = [[['ONE SIZE'], 'Accessories']]\n", " no_whitespace_htmlregex = False\n", " no_whitespace_prodtitleregex = False\n", " whitespace_htmlregex_sides = 0 # <==> 0 = Whitespaces on both sides, 1 = Whitespace on left side, 2 = Right side, 3 = Either/Or\n", " whitespace_prodtitleregex_sides = 0 # <==> 0 = Whitespaces on both sides, 1 = Whitespace on left side, 2 = Right side, 3 = Either/Or\n", " apply_finalsize_as_size = False\n", " # --> Define misc. storage variables\n", " domain_name = ''\n", " # --> Get 'em!\n", " if website['productmisc']:\n", " try:\n", " for i in range(2, len(productmisc_array), 2):\n", " #print(productmisc_array[(i-1)])\n", " #print(productmisc_array[i])\n", " # --- No leading/trailing whitespaces when using regex while searching prod. title for attributes? --- #\n", " if productmisc_array[(i-1)] == 'apply_finalsize_as_size':\n", " apply_finalsize_as_size = True\n", " # --- Any specific way the sizes should be handled? --- #\n", " if productmisc_array[(i-1)] == 'size_handle':\n", " if productmisc_array[i] != 'true':\n", " size_handle_arrs = productmisc_array[i].strip().split('|')\n", " for size_handle_arr in size_handle_arrs:\n", " size_handle_arr = size_handle_arr.strip().split(':')\n", " if len(size_handle_arr) < 3:\n", " size_handling_options.append([ int(size_handle_arr[0]), size_handle_arr[1], '/' ])\n", " else:\n", " size_handling_options.append([ int(size_handle_arr[0]), size_handle_arr[1], size_handle_arr[2] ])\n", " productmisc_array[i] = 'true'\n", " # --- Set product as 'Not Available' if the product has been found but the price is not available? --- #\n", " if productmisc_array[(i-1)] == 'allow_not_available':\n", " if price == '1':\n", " notavailable = True\n", " # --- No leading/trailing whitespaces when using regex while searching prod. title for attributes? --- #\n", " if productmisc_array[(i-1)] == 'no_whitespace_prodtitleregex':\n", " no_whitespace_prodtitleregex = True\n", " # --- Should whitespace in HTML regex attribute search be applied on a specific side? --- #\n", " if productmisc_array[(i-1)] == 'whitespace_prodtitleregex_sides':\n", " if productmisc_array[i] != 'true':\n", " whitespace_prodtitleregex_sides = int(productmisc_array[i])\n", " productmisc_array[i] = 'true'\n", " # --- No leading/trailing whitespaces when using regex while searching pure HTML for attributes? --- #\n", " if productmisc_array[(i-1)] == 'no_whitespace_htmlregex':\n", " no_whitespace_htmlregex = True\n", " # --- Should whitespace in HTML regex attribute search be applied on a specific side? --- #\n", " if productmisc_array[(i-1)] == 'whitespace_htmlregex_sides':\n", " if productmisc_array[i] != 'true':\n", " whitespace_htmlregex_sides = int(productmisc_array[i])\n", " productmisc_array[i] = 'true'\n", " # --- Are the sizes belonging to the current product of a a specific misc. size type? --- #\n", " if productmisc_array[(i-1)] == 'sizetypemisc':\n", " sizetypemisc = productmisc_array[i]\n", " # --- Should we skip any already existing product attributes when scraping the product? --- #\n", " if productmisc_array[(i-1)] == 'skip_exist_attr':\n", " if productmisc_array[i] != 'true':\n", " skip_exist_attr = [ int(skipval) for skipval in productmisc_array[i].strip().split(',') ]\n", " productmisc_array[i] = 'true'\n", " # --- Should we skip any already existing product attributes when scraping the product? --- #\n", " if productmisc_array[(i-1)] == 'skip_exist_attr_prodtitle':\n", " if productmisc_array[i] != 'true':\n", " skip_exist_attr_prodtitle = [ int(skipval) for skipval in productmisc_array[i].strip().split(',') ]\n", " productmisc_array[i] = 'true'\n", " # --- Should we skip any sizes that correspond with other page elements on a certain condition? --- #\n", " # !!! IMPORTANT --- PUT THIS AFTER ALL STANDARD SIZES HAVE BEEN IMPORTED FROM PRODUCTMISC STRING! --- IMPORTANT !!! #\n", " if productmisc_array[(i-1)] == 'skip_pa_size_on_corrsp':\n", " if product_sizes != '':\n", " corrsp_elements = productmisc_array[i].split(',')\n", " corrsp_elements[0] = root.cssselect(corrsp_elements[0].encode().decode(\"unicode-escape\"))\n", " corrsp_elements[1] = corrsp_elements[1].strip().split('|')\n", " if corrsp_elements[1][0] == 'bool_text':\n", " count = 0\n", " for el in corrsp_elements[0]:\n", " if el.text is not None:\n", " if el.text == corrsp_elements[1][1]:\n", " del product_sizes[count]\n", " continue\n", " count += 1\n", " productmisc_array[i] = 'true'\n", " # --- Are there any pre-existing currencies to apply to the price(s)? --- #\n", " if productmisc_array[(i-1)] == 'pre_existing_currency':\n", " preexistingcurrency = productmisc_array[i]\n", " newprice = ''\n", " newprice = price + productmisc_array[i].strip()\n", " if website['currencysymbol']:\n", " newprice.upper()\n", " #if not re.match('[0-9]', newprice): newprice = ''\n", " newprice = converttocorrectprice(newprice, website['currencysymbol'], headers)\n", " else:\n", " newprice = newprice.replace(r'[^0-9,.]', '')\n", " newprice = getmoneyfromtext(newprice)\n", " price = newprice\n", " if salesprice != '':\n", " newprice = ''\n", " newprice = salesprice + productmisc_array[i].strip()\n", " if website['currencysymbol']:\n", " newprice.upper()\n", " #if not re.match('[0-9]', newprice): newprice = ''\n", " newprice = converttocorrectprice(newprice, website['currencysymbol'], headers)\n", " else:\n", " newprice = newprice.replace(r'[^0-9,.]', '')\n", " newprice = getmoneyfromtext(newprice)\n", " salesprice = newprice\n", " # --- Should the product skip any URLs(Product logo and normal IMGs) containing any specific string(s)? --- #\n", " if productmisc_array[(i-1)] == 'skip_img_containing':\n", " if image_urls_valid != '':\n", " count = 0\n", " for e in range(0, len(image_urls_valid), 1):\n", " if image_urls_valid[(e+count)].find(productmisc_array[i].strip()) != -1:\n", " del image_urls_valid[e+count]\n", " count-=1\n", " images = ','.join(image_urls_valid)\n", " if prodlog_image_urls != '':\n", " for imagekey, imageval in prodlog_image_urls.copy().items():\n", " if imageval.find(productmisc_array[i].strip()) != -1:\n", " del prodlog_image_urls[imagekey]\n", " productlogourl = prodlog_image_urls[0] \n", " # --- Should we remove the product on 404 Error? --- #\n", " if productmisc_array[(i-1)] == 'allow_remove_on_404':\n", " shouldremoveonnotfound = True\n", " # --- Use custom domain name(In case any brands doesn't exist for current product) --- #\n", " if productmisc_array[(i-1)] == 'domain_name':\n", " brand_array = []\n", " if productmisc_array[i] != '':\n", " brand_termus = productmisc_array[i].strip()\n", " domain_name = brand_termus\n", " clean_brand = slugify(brand_termus.strip())\n", " term = doesprodattrexist(jsonprodattr['pa_brand'], brand_termus, 'pa_brand')\n", " #TUPPLE STRUCTURE: (Term(ID/NAME/SLUG), newtermTrue_existingtermFalse)\n", " if term:\n", " brand_array.append((term, False))\n", " else:\n", " term = {'term_id':-1, 'name':brand_termus, 'slug':clean_brand, 'taxonomy':'pa_brand'}\n", " brand_array.append((term, True))\n", " product_brand = brand_array\n", " productmisc_array[i] = '.somethingelse'\n", " # --- Should the product apply a specific category automatically? --- #\n", " if productmisc_array[(i-1)] == 'add_category':\n", " cats_to_add = ','.split(productmisc_array[i])\n", " cat_result = []\n", " for cat in cats_to_add:\n", " clean_cat = slugify(cat.strip())\n", " term = doesprodattrexist(jsonprodattr['product_cat'], cat, 'product_cat')\n", " #TUPPLE STRUCTURE: (Term(ID/NAME/SLUG), newtermTrue_existingtermFalse, resultcats)\n", " if term:\n", " if not list(filter(lambda x: x[0]['term_id'] == term['term_id'], cat_result)):\n", " cat_result.append((term, False))\n", " cat_parents = term['ancestors']\n", " for parent_id in cat_parents:\n", " parent = doesprodattrexist(jsonprodattr['product_cat'], parent_id, 'product_cat')\n", " if not list(filter(lambda x: x[0]['term_id'] == parent['term_id'], cat_result)):\n", " cat_result.append((parent, False))\n", " else:\n", " term = {'term_id':-1, 'name':cat, 'slug':clean_cat, 'taxonomy':'product_cat'}\n", " cat_result.append((term, True))\n", " product_categories = cat_result\n", " # --- Should the product apply the male/female attribute automatically? --- #\n", " # --- !!! IMPORTANT --> IF THIS SHOULD OVERRIDE OTHER SEX ATTR. IMPORTS, !!! --- #\n", " # --- !!! THEN PUT THIS LAST IN ORDER IN PRODUCTMISC. TEXT FIELD BEFORE SCRAPING !!! --- #\n", " if product_sex == '':\n", " if productmisc_array[(i-1)] == 'is_male':\n", " product_sex = [(doesprodattrexist(jsonprodattr['pa_sex'], 'Male', 'pa_sex'), False)]\n", " elif productmisc_array[(i-1)] == 'is_female':\n", " product_sex = [(doesprodattrexist(jsonprodattr['pa_sex'], 'Female', 'pa_sex'), False)]\n", " #print('SEX VALUES:')\n", " #print(i)\n", " #for sex in product_sex: print(sex)\n", " # --> Attempt scraping of product misc. elements:\n", " prodmisc_backup = productmisc_array[i].strip().encode().decode(\"unicode-escape\")\n", " #productmisc_array[i] = browser.find_by_css(productmisc_array[i].encode().decode(\"unicode-escape\"))\n", " productmisc_array[i] = root.cssselect(productmisc_array[i].encode().decode(\"unicode-escape\"))\n", " if productmisc_array[i]:\n", " # --- Has the product got any special sale price applied? --- #\n", " if productmisc_array[(i-1)] == 'before_sale_price':\n", " if len(productmisc_array[i]) > 0:\n", " newprice = productmisc_array[i][0].text\n", " if website['currencysymbol']:\n", " newprice.upper()\n", " if website['pricedelimitertoignore']:\n", " if website['pricedelimitertoignore'].strip().find(' ') != -1:\n", " sepdelimiters = website['pricedelimitertoignore'].strip().split(' ')\n", " for delim in sepdelimiters:\n", " newprice = re.sub('\\\\' + delim.strip() + '', '', newprice)\n", " else:\n", " newprice = re.sub('\\\\' + website['pricedelimitertoignore'].strip() + '', '', newprice) \n", " #if not re.match('[0-9]', newprice): newprice = ''\n", " newprice = converttocorrectprice(newprice, website['currencysymbol'], headers)\n", " else:\n", " newprice = newprice.replace(r'[^0-9,.]', '')\n", " newprice = getmoneyfromtext(newprice) \n", " salesprice = price\n", " price = newprice\n", " # --- Get sex attributes from current scrape --- #\n", " if productmisc_array[(i-1)] == 'pa_sex':\n", " if len(productmisc_array[i]) > 0:\n", " sex_array = []\n", " for sex_termus in productmisc_array[i]:\n", " sex_termus = sex_termus.text\n", " check_sex = sex_termus.lower()\n", " if check_sex == 'men' or check_sex == 'man':\n", " sex_termus = 'male'\n", " elif check_sex == 'women' or check_sex == 'woman':\n", " sex_termus = 'female'\n", " clean_sex = sex_termus.strip()\n", " term = doesprodattrexist(jsonprodattr['pa_sex'], sex_termus, 'pa_sex')\n", " #TUPPLE STRUCTURE: (Term(ID/NAME/SLUG), newtermTrue_existingtermFalse)\n", " if term:\n", " sex_array.append((term, False))\n", " else:\n", " term = {'term_id':-1, 'name':sex_termus, 'slug':clean_sex, 'taxonomy':'pa_sex'}\n", " sex_array.append((term, True))\n", " product_sex = sex_array\n", " # --- Get brand attribute(s) from current scrape --- #\n", " if productmisc_array[(i-1)] == 'pa_brand':\n", " brand_array = []\n", " if len(productmisc_array[i]) > 0 and productmisc_array[i][0] is not None:\n", " brand_termus = productmisc_array[i][0].text\n", " if brand_termus is not None:\n", " clean_brand = slugify(brand_termus.strip())\n", " term = doesprodattrexist(jsonprodattr['pa_brand'], brand_termus, 'pa_brand')\n", " # TUPPLE STRUCTURE: (Term(ID/NAME/SLUG), newtermTrue_existingtermFalse)\n", " if term:\n", " brand_array.append((term, False))\n", " else:\n", " term = {'term_id':-1, 'name':brand_termus, 'slug':clean_brand, 'taxonomy':'pa_brand'}\n", " brand_array.append((term, True))\n", " product_brand = brand_array\n", " # --- Get size attributes from current scrape --- #\n", " if productmisc_array[(i-1)] == 'pa_size':\n", " if len(productmisc_array[i]) > 0:\n", " #PRINT('SIZES, PRODUCTMISC_SCRAPES:')\n", " #for size in productmisc_array[i]: print(size)\n", " size_array = []\n", " for size_termus in productmisc_array[i]:\n", " #print(etree.tostring(size_termus))\n", " size_text = size_termus.text\n", " #print(size_termus)\n", " #print(size_termus.text)\n", " #print(\"\".join(size_termus.itertext()))\n", " if size_text is None:\n", " if size_termus.tail is None:\n", " size_termus = \"\".join(size_termus.itertext())\n", " else:\n", " size_termus = size_termus.tail\n", " else:\n", " size_termus = size_text\n", " output = re.search(r'\\(.*Only.*\\)|\\(.*Out.*\\)|\\(.*In.*\\)|\\(.*Lager.*\\)', size_termus, flags=re.IGNORECASE)\n", " output2 = re.search(r'.*Bevaka.*', size_termus, flags=re.IGNORECASE)\n", " output3 = re.search(r'.*Stock.*', size_termus, flags=re.IGNORECASE)\n", " output4 = re.search(r'.*Size\\s+\\d+.*', size_termus, flags=re.IGNORECASE)\n", " output5 = re.search(r'.*Choose.*', size_termus, flags=re.IGNORECASE)\n", " output6 = re.search(r'.*Empty.*', size_termus, flags=re.IGNORECASE)\n", " output7 = re.search(r'.*Select.*', size_termus, flags=re.IGNORECASE)\n", " output8 = re.search(r'.*storlek.*', size_termus, flags=re.IGNORECASE)\n", " if output is not None:\n", " size_termus = re.sub(r'\\(.*\\)', '', size_termus, flags=re.IGNORECASE)\n", " elif output2 is not None:\n", " size_termus = re.sub(r'\\s+-\\s+Bevaka.*', '', size_termus, flags=re.IGNORECASE)\n", " elif output3 is not None:\n", " size_termus = re.sub(r'\\s+-\\s+.*Stock.*', '', size_termus, flags=re.IGNORECASE)\n", " elif output4 is not None:\n", " size_termus = re.sub(r'.*Size\\s+', '', size_termus, flags=re.IGNORECASE)\n", " elif output5 is not None:\n", " size_termus = re.sub(r'.*Choose.*', '', size_termus, flags=re.IGNORECASE)\n", " elif output6 is not None:\n", " size_termus = re.sub(r'.*Empty.*', '', size_termus, flags=re.IGNORECASE)\n", " elif output7 is not None:\n", " size_termus = re.sub(r'.*Select.*', '', size_termus, flags=re.IGNORECASE)\n", " elif output8 is not None:\n", " size_termus = re.sub(r'.*storlek.*', '', size_termus, flags=re.IGNORECASE)\n", " size_termus = size_termus.replace(' ', '').replace('\\n', '')\n", " clean_size = slugify(size_termus.strip())\n", " term = doesprodattrexist(jsonprodattr['pa_size'], size_termus, 'pa_size')\n", " if term:\n", " size_array.append((term, False))\n", " else:\n", " term = {'term_id':-1, 'name':size_termus, 'slug':clean_size, 'taxonomy':'pa_size'}\n", " size_array.append((term, True))\n", " product_sizes = size_array\n", " # --- Get color attributes from current scrape --- #\n", " if productmisc_array[(i-1)] == 'pa_color':\n", " if len(productmisc_array[i]) > 0:\n", " color_array = []\n", " for color_termus in productmisc_array[i]:\n", " color_termus = color_termus.text\n", " if color_termus is not None:\n", " clean_color = slugify(color_termus.strip())\n", " term = doesprodattrexist(jsonprodattr['pa_color'], color_termus, 'pa_color')\n", " if term:\n", " color_array.append((term, False))\n", " else:\n", " term = {'term_id':-1, 'name':color_termus, 'slug':clean_color, 'taxonomy':'pa_color'}\n", " color_array.append((term, True))\n", " product_colors = color_array\n", " # --- Get categories from current scrape --- #\n", " if productmisc_array[(i-1)] == 'pa_category':\n", " if len(productmisc_array[i]) > 0:\n", " category_array = []\n", " for cat_termus in productmisc_array[i]:\n", " cat_termus = cat_termus.text\n", " clean_cat = slugify(cat_termus.strip())\n", " term = doesprodattrexist(jsonprodattr['product_cat'], cat_termus, 'product_cat')\n", " if term:\n", " if not list(filter(lambda x: x[0]['term_id'] == term['term_id'], category_array)):\n", " category_array.append((term, False))\n", " cat_parents = term['ancestors']\n", " for parent_id in cat_parents:\n", " parent = doesprodattrexist(jsonprodattr['product_cat'], parent_id, 'product_cat')\n", " if not list(filter(lambda x: x[0]['term_id'] == parent['term_id'], category_array)):\n", " category_array.append((parent, False))\n", " else:\n", " term = {'term_id':-1, 'name':cat_termus, 'slug':clean_cat, 'taxonomy':'product_cat'}\n", " category_array.append((term, True))\n", " if category_array:\n", " if product_categories != '':\n", " product_categories = array_merge(product_categories, category_array)\n", " else:\n", " product_categories = category_array\n", " # --> Check if any product fixes should be applied for category check!\n", " if jsonprodfixes:\n", " cat_prodfix_regex_list = [[re.sub('\\{pa_category\\}', '', i['selectionfield']),\\\n", " i['actionfield']] for i in jsonprodfixes if '{pa_category}' in i['selectionfield']]\n", " product_categories_names = [i[0]['name'] for i in product_categories]\n", " for fix in cat_prodfix_regex_list:\n", " cat_prodfix_names = fix[0].split(',')\n", " found_names = [i for i in product_categories_names if i in cat_prodfix_names] \n", " if len(found_names) > 0:\n", " if re.search('{remove_product}', fix[1], flags=re.IGNORECASE):\n", " notfound = True\n", " removeon404 = True\n", " try:\n", " scraperwiki.sqlite.save(unique_keys=['productid'],\\\n", " data={'productid': product['productid'],\\\n", " 'url': product['url'],\\\n", " 'domain': product['domain'],\\\n", " 'price': '',\\\n", " 'salesprice': '',\\\n", " 'domainmisc': '',\\\n", " 'prodlogurls': '',\\\n", " 'prodlogurl': '',\\\n", " 'finalimgurls': '',\\\n", " 'validimgurls': '',\\\n", " 'imgurls': '',\\\n", " 'notfound': notfound,\\\n", " 'notavailable': True,\\\n", " 'removeon404': removeon404,\\\n", " 'soldoutfix': 0,\\\n", " 'soldouthtmlfix': 0,\\\n", " 'catstoaddresult': '',\\\n", " 'attributes': '',\\\n", " 'sizetypemapsqls': ''})\n", " totalscrapedcount = totalscrapedcount + 1\n", " skipfinalsave = True\n", " except:\n", " #print(\"Error: \" + str(sys.exc_info()[0]) + \" occured!\")\n", " print(traceback.format_exc()) \n", " # --- Is the product no longer existing - Does the page for it not exist anymore? --- #\n", " if productmisc_array[(i-1)] == 'notfound':\n", " if len(productmisc_array[i]) > 0:\n", " notfound = True\n", " # --- Has the product sold out yet? --- #\n", " if productmisc_array[(i-1)] == 'sold_out':\n", " if len(productmisc_array[i]) > 0:\n", " soldoutupdatemeta = True\n", " price = '0.0 BUCKS'\n", " price = price.replace(r'[^0-9,.]', '')\n", " price = getmoneyfromtext(price)\n", " else:\n", " soldoutupdatemeta = False\n", " # --> Check the HTML if neccessary! Any already existing product attributes found there?\n", " #productmisc_array[i] = lxml.html.tostring(productmisc_array[i])\n", " #productmisc_array[i] = etree.tostring(productmisc_array[i][0])\n", " #selected = browser.find_by_css(prodmisc_backup.strip().encode().decode(\"unicode-escape\"))\n", " selected = root.cssselect(prodmisc_backup.strip().encode().decode(\"unicode-escape\"))\n", " #productmisc_array[i] = selected[0].html\n", " productmisc_array[i] = str(etree.tostring(selected[0]))\n", " # --- Get sex attributes from current scrape --- #\n", " if productmisc_array[(i-1)] == 'pa_sex_html':\n", " sexies = jsonprodattr['pa_sex']\n", " sexies_result = []\n", " for sexterm in sexies:\n", " term_name = sexterm['name']\n", " sex_html = str(productmisc_array[i])\n", " regex = ''\n", " if term_name == 'Male':\n", " regex = r'\\bmale\\b|\\bmen\\b|\\bman\\b'\n", " elif term_name == 'Female':\n", " regex = r'\\bfemale\\b|\\bwomen\\b|\\bwoman\\b'\n", " #if sex_html.upper().find(term_name.upper()) != -1:\n", " if re.search(regex, sex_html, flags=re.IGNORECASE):\n", " term = doesprodattrexist(jsonprodattr['pa_sex'], sexterm['term_id'], 'pa_sex')\n", " if term:\n", " sexies_result.append((term, False))\n", " product_sex = sexies_result\n", " # --- Get size attribute(s) from current scrape --- #\n", " if productmisc_array[(i-1)] == 'pa_size_html':\n", " sizies = jsonprodattr['pa_size']\n", " sizies_result = []\n", " for sizeterm in sizies:\n", " term_name = sizeterm['name']\n", " size_html = str(productmisc_array[i])\n", " output = re.search(r'\\(.*Only.*\\)|\\(.*Out.*\\)|\\(.*In.*\\)|\\(.*Lager.*\\)', size_html, flags=re.IGNORECASE)\n", " output2 = re.search(r'.*Bevaka.*', size_html, flags=re.IGNORECASE)\n", " output3 = re.search(r'.*Stock.*', size_html, flags=re.IGNORECASE)\n", " output4 = re.search(r'.*Size\\s+\\d+.*', size_html, flags=re.IGNORECASE)\n", " output5 = re.search(r'.*Choose.*', size_html, flags=re.IGNORECASE)\n", " output6 = re.search(r'.*Empty.*', size_html, flags=re.IGNORECASE)\n", " output7 = re.search(r'.*Select.*', size_html, flags=re.IGNORECASE)\n", " output8 = re.search(r'.*storlek.*', size_html, flags=re.IGNORECASE)\n", " if output is not None:\n", " size_html = re.sub(r'\\(.*\\)', '', size_html, flags=re.IGNORECASE)\n", " elif output2 is not None:\n", " size_html = re.sub(r'\\s+-\\s+Bevaka.*', '', size_html, flags=re.IGNORECASE)\n", " elif output3 is not None:\n", " size_html = re.sub(r'\\s+-\\s+.*Stock.*', '', size_html, flags=re.IGNORECASE)\n", " elif output4 is not None:\n", " size_html = re.sub(r'.*Size\\s+', '', size_html, flags=re.IGNORECASE)\n", " elif output5 is not None:\n", " size_html = re.sub(r'.*Choose.*', '', size_html, flags=re.IGNORECASE)\n", " elif output6 is not None:\n", " size_html = re.sub(r'.*Empty.*', '', size_html, flags=re.IGNORECASE)\n", " elif output7 is not None:\n", " size_html = re.sub(r'.*Select.*', '', size_html, flags=re.IGNORECASE)\n", " elif output8 is not None:\n", " size_html = re.sub(r'.*storlek.*', '', size_html, flags=re.IGNORECASE)\n", " if size_html.upper().find(term_name.upper()) != -1:\n", " term = doesprodattrexist(jsonprodattr['pa_size'], sizeterm['term_id'], 'pa_size')\n", " if term:\n", " sizies_result.append((term, False))\n", " if sizies_result:\n", " if product_sizes == '':\n", " product_sizes = sizies_result\n", " else:\n", " product_sizes = array_merge(product_sizes, sizies_result)\n", " # --- Get brand attributes from current scrape --- #\n", " if productmisc_array[(i-1)] == 'pa_brand_html':\n", " brandies = jsonprodattr['pa_brand']\n", " brandies_result = []\n", " for brandterm in brandies:\n", " term_name = brandterm['name']\n", " brand_html = str(productmisc_array[i])\n", " if brand_html.upper().find(term_name.upper()) != -1:\n", " term = doesprodattrexist(jsonprodattr['pa_brand'], brandterm['term_id'], 'pa_brand')\n", " if term:\n", " brandies_result.append((term, False))\n", " if brandies_result:\n", " if product_brand == '':\n", " product_brand = brandies_result\n", " else:\n", " product_brand = array_merge(product_brand, brandies_result)\n", " # --- Get categories from current scrape --- #\n", " if productmisc_array[(i-1)] == 'pa_category_html':\n", " caties = jsonprodattr['product_cat']\n", " caties_result = []\n", " #print('CATHTML: ' + str(productmisc_array[i]))\n", " for catterm in caties:\n", " term_name = catterm['name']\n", " cat_html = str(productmisc_array[i])\n", " if translate_pa_category_html != '':\n", " langs_to_trnsl = translate_pa_category_html.split(',')\n", " translator = Translator(to_lang=langs_to_trnsl[0], from_lang=langs_to_trnsl[1])\n", " cat_html = translator.translate(cat_html)\n", " array_categorymaps = jsoncatmaps\n", " #print(type(cat_html))\n", " #print(type(term_name))\n", " leftside = '\\s' if whitespace_htmlregex_sides != 2 else ''\n", " rightside = '\\s' if whitespace_htmlregex_sides != 1 else ''\n", " if array_categorymaps:\n", " #if hasattr(array_categorymaps, term_name):\n", " if term_name in array_categorymaps:\n", " #print('HERE!')\n", " infliction_array = jsoncatmaps[term_name]['catinflections'].split(',')\n", " for infliction in infliction_array:\n", " #print('INFLICTION: ' + infliction)\n", " #if cat_html.upper().find(r'\\s'+infliction.upper()+r'\\s') != -1:\n", " regex = ''\n", " if no_whitespace_htmlregex is True:\n", " regex = ''+infliction.strip()+''\n", " else:\n", " if whitespace_htmlregex_sides == 3:\n", " regex = leftside+infliction.strip()+'|'+infliction.strip()+rightside\n", " else:\n", " regex = leftside+infliction.strip()+rightside\n", " #print('INF_REGEX: ' + regex)\n", " if re.search(regex, cat_html, flags=re.IGNORECASE):\n", " #print('FOUND INFLICTION!')\n", " term = doesprodattrexist(jsonprodattr['product_cat'], catterm['term_id'], 'product_cat')\n", " if term:\n", " if not list(filter(lambda x: x[0]['term_id'] == term['term_id'], caties_result)):\n", " caties_result.append((term, False))\n", " cat_parents = term['ancestors']\n", " for parent_id in cat_parents:\n", " parent = doesprodattrexist(jsonprodattr['product_cat'], parent_id, 'product_cat')\n", " if not list(filter(lambda x: x[0]['term_id'] == parent['term_id'], caties_result)):\n", " caties_result.append((parent, False))\n", " #print('CATTERM: ' + term_name)\n", " #if cat_html.upper().find(r'\\s'+term_name.upper()+r'\\s') != -1:\n", " regex = ''\n", " if no_whitespace_htmlregex is True:\n", " regex = ''+term_name.strip()+''\n", " else:\n", " if whitespace_htmlregex_sides == 3:\n", " regex = leftside+term_name.strip()+'|'+term_name.strip()+rightside\n", " else:\n", " regex = leftside+term_name.strip()+rightside\n", " #print('CATTERM_REGEX: ' + regex)\n", " if re.search(regex, cat_html, flags=re.IGNORECASE):\n", " #print('FOUND CATTERM!')\n", " term = doesprodattrexist(jsonprodattr['product_cat'], catterm['term_id'], 'product_cat')\n", " if term:\n", " if not list(filter(lambda x: x[0]['term_id'] == term['term_id'], caties_result)):\n", " caties_result.append((term, False))\n", " cat_parents = term['ancestors']\n", " for parent_id in cat_parents:\n", " parent = doesprodattrexist(jsonprodattr['product_cat'], parent_id, 'product_cat')\n", " if not list(filter(lambda x: x[0]['term_id'] == parent['term_id'], caties_result)):\n", " caties_result.append((parent, False))\n", " if caties_result:\n", " if product_categories == '':\n", " product_categories = caties_result\n", " else:\n", " product_categories = array_merge(product_categories, caties_result)\n", " #print('PA_CAT_HTML_CATS: '+json.dumps(product_categories))\n", " # --> Check if any product fixes should be applied for category HTML check!\n", " if jsonprodfixes:\n", " cat_prodfix_regex_list = [[re.sub('\\{regex_in_pa_category_html\\}', '', i['selectionfield']),\\\n", " i['actionfield']] for i in jsonprodfixes if 'regex_in_pa_category_html' in i['selectionfield']]\n", " cat_html = str(productmisc_array[i])\n", " for fix in cat_prodfix_regex_list:\n", " if re.search(fix[0], cat_html, flags=re.IGNORECASE):\n", " if product_categories == \"\":\n", " product_categories = []\n", " if re.search('{remove_category}', fix[1], flags=re.IGNORECASE):\n", " #print('HERE!')\n", " cats_to_remove = re.sub('\\{remove_category\\}', '', fix[1]).split(',')\n", " for cat_remove in cats_to_remove:\n", " cat_remove_name = cat_remove.strip()\n", " product_categories = list(filter(lambda x: not re.search(cat_remove_name, x[0]['name'],\\\n", " flags=re.IGNORECASE), product_categories))\n", " term = doesprodattrexist(jsonprodattr['product_cat'], cat_remove_name, 'product_cat')\n", " if term:\n", " cat_parents = term['ancestors']\n", " for parent_id in cat_parents:\n", " parent = doesprodattrexist(jsonprodattr['product_cat'], parent_id, 'product_cat')\n", " if parent:\n", " product_categories = list(filter(lambda x: not re.search(parent['name'], x[0]['name'],\\\n", " flags=re.IGNORECASE), product_categories))\n", " #print(json.dumps(product_categories))\n", " if re.search('{add_category}', fix[1], flags=re.IGNORECASE):\n", " cats_to_add = re.sub('\\{add_category\\}', '', fix[1]).split(',')\n", " for cat_add in cats_to_add:\n", " found_cats = list(filter(lambda x: re.search(cat_add, x[0]['name'],\\\n", " flags=re.IGNORECASE), product_categories))\n", " if len(found_cats) == 0:\n", " cat_add_name = cat_add.strip()\n", " clean_cat = slugify(cat_add_name.strip())\n", " term = doesprodattrexist(jsonprodattr['product_cat'], cat_add_name, 'product_cat')\n", " if term:\n", " if not list(filter(lambda x: x[0]['term_id'] == term['term_id'], product_categories)):\n", " product_categories.append((term, False))\n", " cat_parents = term['ancestors']\n", " for parent_id in cat_parents:\n", " parent = doesprodattrexist(jsonprodattr['product_cat'], parent_id, 'product_cat')\n", " if not list(filter(lambda x: x[0]['term_id'] == parent['term_id'], product_categories)):\n", " product_categories.append((parent, False))\n", " else:\n", " term = {'term_id':-1, 'name':cat_add_name, 'slug':clean_cat, 'taxonomy':'product_cat'}\n", " product_categories.append((term, True))\n", " # --- Get color attributes from current scrape --- #\n", " if productmisc_array[(i-1)] == 'pa_color_html':\n", " colories = jsonprodattr['pa_color']\n", " colories_result = []\n", " for colorterm in colories:\n", " term_name = colorterm['name']\n", " color_html = str(productmisc_array[i])\n", " if color_html.upper().find(term_name.upper()) != -1:\n", " term = doesprodattrexist(jsonprodattr['pa_color'], colorterm['term_id'], 'pa_color')\n", " if term:\n", " colories_result.append((term, False))\n", " if colories_result:\n", " if product_colors == '':\n", " product_colors = colories_result\n", " else:\n", " product_colors = array_merge(product_colors, colories_result)\n", " # --- Has the product sold out yet? --- #\n", " if productmisc_array[(i-1)] == 'sold_out_html':\n", " selector_one_string_two = prodmisc_backup.split(',')\n", " if len(selector_one_string_two) > 1:\n", " #soldoutselect = browser.find_by_css(selector_one_string_two[0].strip().encode().decode(\"unicode-escape\"))\n", " soldoutselect = root.cssselect(str(selector_one_string_two[0]).strip().encode().decode(\"unicode-escape\"))\n", " #productmisc_array[i] = soldoutselect[0].html\n", " productmisc_array[i] = str(etree.tostring(soldoutselect[0]))\n", " if productmisc_array[i].find(selector_one_string_two[1]) != -1:\n", " soldouthtmlupdatemeta = True\n", " price = '0.0 BUCKS'\n", " price = price.replace(r'[^0-9,.]', '')\n", " price = getmoneyfromtext(price)\n", " else:\n", " soldouthtmlupdatemeta = False\n", " # --- Should we skip the first size alternative on information import? --- #\n", " if productmisc_array[(i-1)] == 'skip_first_size':\n", " if product_sizes != '':\n", " removed_size = product_sizes.pop(0)\n", " # >>> CHECK FOR PRODUCT PROPERITES IN TITLE(IF ENABLED) <<< #\n", " if website['lookforprodpropintitle'] == '1':\n", " try:\n", " termies = [[], [], []]\n", " termies[0] = jsonprodattr['pa_brand']\n", " termies[1] = jsonprodattr['pa_color']\n", " termies[2] = jsonprodattr['pa_sex']\n", " termies_result = [[], [], []]\n", " for i in range(3):\n", " for term in termies[i]:\n", " term_name = term['name']\n", " product_name = product['name']\n", " if product_name.upper().find(term_name.upper()) != -1:\n", " termies_result[i].append((doesprodattrexist(jsonprodattr[term['taxonomy']], term['term_id'], term['taxonomy']), False))\n", " if termies_result[0] and skip_exist_attr_prodtitle[0] != 1:\n", " brand_values = product_brand\n", " skip_domain_name = False\n", " if website['productmisc']:\n", " output = re.search(r'(skip_domainbrand_if_found)', website['productmisc'])\n", " if output is not None and len(output.group(0)) > 0:\n", " skip_domain_name = True\n", " if brand_values:\n", " #existing_brands = re.split(',\\s*', brand_values)\n", " existing_brands = brand_values\n", " ###count = 0\n", " ###for brand in existing_brands:\n", " ### existing_brands[count] = (brand, False)\n", " ### count+=1\n", " termies_result[0] = array_merge(termies_result[0], existing_brands)\n", " if skip_domain_name and domain_name != '' and len(termies_result[0]) > 1:\n", " termies_result[0] = list(filter(lambda x: x[0]['name'].upper().find(domain_name.upper()) == -1, termies_result[0]))\n", " product_brand = termies_result[0]\n", " if termies_result[1] and skip_exist_attr_prodtitle[1] != 1:\n", " color_values = product_colors\n", " if color_values:\n", " #existing_colors = re.split(',\\s*', color_values)\n", " existing_colors = color_values\n", " ###count = 0\n", " ###for color in existing_colors:\n", " ### existing_colors[count] = (color, False)\n", " ### count+=1\n", " termies_result[1] = array_merge(termies_result[1], existing_colors)\n", " product_colors = termies_result[1]\n", " if termies_result[2] and skip_exist_attr_prodtitle[2] != 1:\n", " sex_values = product_sex\n", " if sex_values:\n", " #existing_sex = re.split(',\\s*', sex_values)\n", " existing_sex = sex_values\n", " ###count = 0\n", " ###for sex in existing_sex:\n", " ### existing_sex[count] = (sex, False)\n", " ### count+=1\n", " termies_result[2] = array_merge(termies_result[2], existing_sex)\n", " product_sex = termies_result[2]\n", " # --> Look after categories in the product title!\n", " category_terms = jsonprodattr['product_cat']\n", " category_result = []\n", " for term in category_terms:\n", " term_name = term['name']\n", " product_name = product['name']\n", " array_categorymaps = jsoncatmaps\n", " leftside = '\\s' if whitespace_prodtitleregex_sides != 2 else ''\n", " rightside = '\\s' if whitespace_prodtitleregex_sides != 1 else ''\n", " if array_categorymaps:\n", " if term_name in array_categorymaps:\n", " infliction_array = jsoncatmaps[term_name]['catinflections'].split(',')\n", " for infliction in infliction_array:\n", " if no_whitespace_prodtitleregex is True:\n", " regex = ''+infliction.strip()+''\n", " else:\n", " if whitespace_prodtitleregex_sides == 3:\n", " regex = leftside+infliction.strip()+'|'+infliction.strip()+rightside\n", " else:\n", " regex = leftside+infliction.strip()+rightside\n", " if re.search(regex, product_name, flags=re.IGNORECASE):\n", " term = doesprodattrexist(jsonprodattr['product_cat'], term['term_id'], 'product_cat')\n", " if term:\n", " if not list(filter(lambda x: x[0]['term_id'] == term['term_id'], category_result)):\n", " category_result.append((term, False))\n", " cat_parents = term['ancestors']\n", " for parent_id in cat_parents:\n", " parent = doesprodattrexist(jsonprodattr['product_cat'], parent_id, 'product_cat')\n", " if not list(filter(lambda x: x[0]['term_id'] == parent['term_id'], category_result)):\n", " category_result.append((parent, False))\n", " if no_whitespace_prodtitleregex is True:\n", " regex = ''+term_name.strip()+''\n", " else:\n", " if whitespace_prodtitleregex_sides == 3:\n", " regex = leftside+term_name.strip()+'|'+term_name.strip()+rightside\n", " else:\n", " regex = leftside+term_name.strip()+rightside\n", " if re.search(regex, product_name, flags=re.IGNORECASE):\n", " term = doesprodattrexist(jsonprodattr['product_cat'], term['term_id'], 'product_cat')\n", " if term:\n", " if not list(filter(lambda x: x[0]['term_id'] == term['term_id'], category_result)):\n", " category_result.append((term, False))\n", " cat_parents = term['ancestors']\n", " for parent_id in cat_parents:\n", " parent = doesprodattrexist(jsonprodattr['product_cat'], parent_id, 'product_cat')\n", " if not list(filter(lambda x: x[0]['term_id'] == parent['term_id'], category_result)):\n", " category_result.append((parent, False))\n", " if category_result:\n", " existing_categories = product['category_ids'].copy()\n", " exist_cats = []\n", " #print(json.dumps(existing_categories))\n", " if existing_categories and skip_exist_attr_prodtitle[3] != 1:\n", " #count = 0\n", " for cat in existing_categories.copy():\n", " term = doesprodattrexist(jsonprodattr['product_cat'], cat, 'product_cat')\n", " #print(json.dumps(term))\n", " #print(cat)\n", " if (term['slug'] == 'uncategorized' and len(category_result) > 0)\\\n", " or list(filter(lambda x: x[0]['term_id'] == term['term_id'], category_result)):\n", " #del existing_categories[count]\n", " continue\n", " #existing_categories[count] = ((term, False))\n", " exist_cats.append((term, False))\n", " #count+=1 \n", " category_result = array_merge(category_result, exist_cats)\n", " #print('PRODCATSFINAL_PRODTITLE: ' + json.dumps(product_categories))\n", " #print('CATRESULTS_PRODTITLE: ' + json.dumps(category_result))\n", " if product_categories != '':\n", " #product_categories = array_merge(product_categories, category_result)\n", " for result in category_result:\n", " if not list(filter(lambda x: x[0]['term_id'] == result[0]['term_id'], product_categories)):\n", " product_categories.append(result)\n", " else:\n", " product_categories = category_result\n", " except:\n", " #print(\"Error when looking after prod. properties in title for product ID \" + product['productid'] + \": \" + sys.exc_info()[0] + \" occured!\")\n", " print(traceback.format_exc()) \n", " # --> Fix sex for the product if it doesn't exist already! <-- #\n", " if product_sex == '':\n", " product_sex = [(doesprodattrexist(jsonprodattr['pa_sex'], 'Male', 'pa_sex'), False),\n", " (doesprodattrexist(jsonprodattr['pa_sex'], 'Female', 'pa_sex'), False)]\n", " # --> Fix categories for the product! <-- #\n", " if product_categories:\n", " existing_categories = product['category_ids'].copy()\n", " exist_cats = []\n", " if existing_categories and skip_exist_attr[6] != 1:\n", " #count = 0\n", " for cat in existing_categories.copy():\n", " term = doesprodattrexist(jsonprodattr['product_cat'], cat, 'product_cat')\n", " if term['slug'] == 'uncategorized' and len(product_categories) > 0:\n", " #del existing_categories[count]\n", " continue\n", " #existing_categories[count] = ((term, False))\n", " exist_cats.append((term, False))\n", " #count+=1\n", " #print('PRODCAT BEFORE: ' + json.dumps(product_categories))\n", " #print('EXISCAT BEFORE: ' + json.dumps(existing_categories))\n", " #product_categories = add_together_attrs(product_categories, existing_categories, 'product_cat')\n", " product_categories = add_together_attrs(product_categories, exist_cats, 'product_cat', jsonprodattr)\n", " #print('PRODCAT AFTER: ' + json.dumps(product_categories))\n", " #product_categories = product_categories + existing_categories \n", " #SAVE CAT. IDS AND PRODUCT HERE IN REMOTE SITE\n", " # --> Apply sizetype attributes where needed! <-- #\n", " product_sizetypemiscname = sizetypemisc\n", " if product_categories != '':\n", " array_catsizetypemaps = jsoncatsizetypemaps\n", " array_sizetypes = []\n", " for catsizetypemap in array_catsizetypemaps:\n", " finalcatsizetypes = catsizetypemap['finalcatsizetype'].split(',')\n", " catstosizetypes = catsizetypemap['catstosizetype'].split(',')\n", " product_category_names = []\n", " matching_cats = []\n", " for cat in product_categories:\n", " #print('CAT BEFORE SIZETYPEMISC: ' + json.dumps(cat))\n", " category_to_cast_id = cat[0]['term_id']\n", " term = doesprodattrexist(jsonprodattr['product_cat'], category_to_cast_id, 'product_cat')\n", " if term:\n", " if term['name'] not in product_category_names:\n", " #print('ADDING ' + term['name'] + ' TO ARRAY!')\n", " product_category_names.append(term['name'])\n", " for catstosizetype in catstosizetypes:\n", " #regex = u'(\\b.*' + catstosizetype.strip() + '\\b)'\n", " regex = '' + catstosizetype.strip() + ''\n", " filter_match = []\n", " filter_match = filter(lambda x: re.findall(regex, x, flags=re.IGNORECASE), product_category_names)\n", " #for match in filter_match: print('FILTER MATCH: ' + json.dumps(match))\n", " matching_cats = array_merge(matching_cats, list(filter_match))\n", " if matching_cats:\n", " #print('MATCHING CATS HERE!')\n", " size_type_terms = jsonprodattr['pa_sizetype'].copy()\n", " count = 0\n", " for size_type_term in size_type_terms:\n", " size_type_terms[count] = size_type_term['name']\n", " count+=1\n", " filtered_terms = []\n", " for finalcatsizetype in finalcatsizetypes:\n", " regex = '' + finalcatsizetype.strip() + ''\n", " filter_match = []\n", " filter_match = filter(lambda x: re.findall(regex, x, flags=re.IGNORECASE), size_type_terms)\n", " filtered_terms = array_merge(filtered_terms, list(filter_match))\n", " for filt_term in filtered_terms:\n", " #print('FILT TERM: ' + filt_term)\n", " term = doesprodattrexist(jsonprodattr['pa_sizetype'], filt_term, 'pa_sizetype')\n", " #print('SIZETYPE TO ADD: ' + json.dumps(term))\n", " if term:\n", " array_sizetypes.append((term, False)) \n", " product_sizetypes = array_sizetypes\n", " if not product_sizetypes and product_sizes and not product_sizetypemiscname:\n", " product_sizetypemiscname = 'Other'\n", " else:\n", " if product_sizes and not product_sizetypemiscname:\n", " product_sizetypemiscname = 'Other'\n", " if product_sizetypemiscname:\n", " product_sizetypemiscs = []\n", " term = doesprodattrexist(jsonprodattr['pa_sizetypemisc'], product_sizetypemiscname, 'pa_sizetypemisc')\n", " if term:\n", " product_sizetypemiscs.append((term, False))\n", " else:\n", " namus = product_sizetypemiscname.strip()\n", " slugus = product_sizetypemiscname.strip().lower()\n", " term = {'term_id':-1, 'name':namus, 'slug':slugus, 'taxonomy':'pa_sizetypemisc'}\n", " product_sizetypemiscs.append((term, True))\n", " # --> Fix/correct binds between existing product sizes and sizetypes(Including misc. sizetypes)! <-- #\n", " if product_sizetypes and product_sizes:\n", " # --> First: Map current sizes to pre-destined sizes depending on sizetype! <-- #\n", " #remove_remaining_sizes = False\n", " for sizemap in jsonsizemaps:\n", " sizemap_sizetypes = sizemap['sizetypestofilter'].split(',')\n", " for sizemap_sizetype in sizemap_sizetypes:\n", " sizemap_sizetype = re.sub(r'\\-\\d+', '', sizemap_sizetype.strip())\n", " for sizetype in product_sizetypes:\n", " if sizetype[0]['name'] == sizemap_sizetype:\n", " # --> Check if there are any sex-specific sizes to map!\n", " if len(product_sex) == 1:\n", " sex_name = product_sex[0][0]['name']\n", " split_sizetomaps = sizemap['sizestomap'].split(';')\n", " count = 0\n", " for sizetomap in split_sizetomaps.copy():\n", " if re.search(r'\\(M\\)', sizetomap) and sex_name == 'Male':\n", " split_sizetomaps[count] = re.sub(r'\\(M\\)', '', sizetomap)\n", " elif re.search(r'\\(F\\)', sizetomap) and sex_name == 'Female':\n", " split_sizetomaps[count] = re.sub(r'\\(F\\)', '', sizetomap)\n", " count += 1\n", " sizemap['sizestomap'] = ';'.join(split_sizetomaps)\n", " #print(sizemap['sizestomap'])\n", " # --> Check if there are any specific size handling to do!\n", " # --> !!! IMPORTANT ::: IF NUMBERS NEED TO BE SPLIT BY CHARACTER, MAKE SURE TO SPLIT THEM FIRST BEFORE THIS SECTION !!!\n", " if len(size_handling_options) > 1:\n", " for size_hand_opt in size_handling_options:\n", " if size_hand_opt[1] == sizetype[0]['name']:\n", " split_sizetomaps = sizemap['sizestomap'].split(';')\n", " count = 0\n", " for size in product_sizes.copy():\n", " continue_count = True\n", " if re.search(r'(\\d+\\,\\d|\\d+\\.\\d)', size[0]['name']):\n", " if size_hand_opt[0] == 0 or size_hand_opt[0] == 1:\n", " new_size_name = re.sub(r'(\\,\\d|\\.\\d)', '', size[0]['name'])\n", " if size_hand_opt[0] == 0:\n", " new_size_int = ''.join([i for i in size[0]['name'] if i.isdigit()])\n", " new_size_name = re.sub(r'd+', str(int(new_size_int) + 1), new_size_name)\n", " new_size_term = doesprodattrexist(jsonprodattr['pa_size'], new_size_name.strip(), 'pa_size')\n", " if new_size_term != 0:\n", " product_sizes.append((new_size_term, False))\n", " else:\n", " newsizename = new_size_name.strip()\n", " newsizeslug = slugify(newsizename.strip())\n", " new_size_term = {'term_id':-1, 'name':newsizename, 'slug':newsizeslug, 'taxonomy':'pa_size'}\n", " product_sizes.append((new_size_term, True))\n", " continue_count = False\n", " product_sizes.pop(count)\n", " if re.search(r'\\d\\/\\d', size[0]['name']):\n", " if size_hand_opt[0] == 2 or size_hand_opt[0] == 3:\n", " new_size_name = re.sub(r'\\d\\/\\d', '', size[0]['name'])\n", " if size_hand_opt[0] == 2:\n", " new_size_int = ''.join([i for i in size[0]['name'] if i.isdigit()])\n", " new_size_name = re.sub(r'd+', str(int(new_size_int) + 1), new_size_name)\n", " new_size_term = doesprodattrexist(jsonprodattr['pa_size'], new_size_name.strip(), 'pa_size')\n", " if new_size_term != 0:\n", " product_sizes.append((new_size_term, False))\n", " else:\n", " newsizename = new_size_name.strip()\n", " newsizeslug = slugify(newsizename.strip())\n", " new_size_term = {'term_id':-1, 'name':newsizename, 'slug':newsizeslug, 'taxonomy':'pa_size'}\n", " product_sizes.append((new_size_term, True))\n", " continue_count = False\n", " if re.search(r'd+', size[0]['name']):\n", " if size_hand_opt[0] == 4 or size_hand_opt[0] == 5:\n", " new_size_int = ''.join([i for i in size[0]['name'] if i.isdigit()])\n", " if int(new_size_int) > 0 and int(new_size_int) % 2 == 1:\n", " new_size_name = size[0]['name']\n", " if size_hand_opt[0] == 4:\n", " new_size_name = re.sub(r'd+', str(int(new_size_int) + 1), sizetomap)\n", " elif size_hand_opt[0] == 5:\n", " new_size_name = re.sub(r'd+', str(int(new_size_int) - 1), sizetomap)\n", " new_size_term = doesprodattrexist(jsonprodattr['pa_size'], new_size_name.strip(), 'pa_size')\n", " if new_size_term != 0:\n", " product_sizes.append((new_size_term, False))\n", " else:\n", " newsizename = new_size_name.strip()\n", " newsizeslug = slugify(newsizename.strip())\n", " new_size_term = {'term_id':-1, 'name':newsizename, 'slug':newsizeslug, 'taxonomy':'pa_size'}\n", " product_sizes.append((new_size_term, True))\n", " continue_count = False\n", " if size_hand_opt[0] in range(6, 9):\n", " split_char = size_hand_opt[2].strip() if size_hand_opt[2] else '/'\n", " continue_split = True\n", " if split_char.strip() == 'x':\n", " if not re.search(r'.*\\d+.*x.*\\d+.*', size[0]['name']):\n", " continue_split = False\n", " if continue_split == True:\n", " newsizes = size[0]['name'].split(split_char)\n", " if len(newsizes) > 1:\n", " if size_hand_opt[0] == 7:\n", " removed_size = newsizes.pop()\n", " elif size_hand_opt[0] == 8:\n", " removed_size = newsizes.pop(0)\n", " for newsize in newsizes:\n", " new_size_term = doesprodattrexist(jsonprodattr['pa_size'], newsize.strip(), 'pa_size')\n", " if new_size_term != 0:\n", " product_sizes.append((new_size_term, False))\n", " else:\n", " newsizename = newsize.strip()\n", " newsizeslug = slugify(newsizename.strip())\n", " new_size_term = {'term_id':-1, 'name':newsizename, 'slug':newsizeslug, 'taxonomy':'pa_size'}\n", " product_sizes.append((new_size_term, True))\n", " continue_count = False\n", " if continue_count == True:\n", " count += 1\n", " sizemap['sizestomap'] = ';'.join(split_sizetomaps)\n", " #print(sizemap['sizestomap'])\n", " #found_sizenames = []\n", " #split_sizetomaps = sizemap['sizestomap'].split(',')\n", " #for sizetomap in split_sizetomaps.copy():\n", " #found_sizenames = list(filter(lambda x: re.search(x[0]['name'], sizemap['sizestomap']), product_sizes))\n", " #for prod_size in product_sizes:\n", " # found_sizenames = list(filter(lambda x: prod_size[0]['name'] == x, sizemap['sizestomap']))\n", " enforce_mandatory_sizes = True\n", " split_sizetomaps = sizemap['sizestomap'].split(';')\n", " for sizetomap in split_sizetomaps.copy():\n", " found_sizenames = list(filter(lambda x: x[0]['name'].strip().lower() == sizetomap.strip().lower(), product_sizes))\n", " if found_sizenames:\n", " #for size_to_remove in sizemap['sizestomap'].split(','):\n", " for size_to_remove in split_sizetomaps:\n", " size_to_remove = size_to_remove.strip().lower()\n", " product_sizes = list(filter(lambda x: x[0]['name'].strip().lower() != size_to_remove, product_sizes))\n", " enforce_mandatory_sizes = False\n", " finalterm = doesprodattrexist(jsonprodattr['pa_size'], sizemap['finalsize'].strip(), 'pa_size')\n", " if finalterm != 0:\n", " product_sizes.append((finalterm, False))\n", " else:\n", " finalsizename = sizemap['finalsize'].strip()\n", " finalsizeslug = slugify(finalsizename.strip())\n", " new_finalterm = {'term_id':-1, 'name':finalsizename, 'slug':finalsizeslug, 'taxonomy':'pa_size'}\n", " product_sizes.append((new_finalterm, True))\n", " #print(json.dumps(product_sizes))\n", " break\n", " # --> Do we need to add any mandatory sizes depending on sizetype?\n", " if len(mandatory_sizes) > 0 and (len(product_sizes) == 0 or enforce_mandatory_sizes == True):\n", " for mandsize in mandatory_sizes:\n", " if mandsize[0] != '' and mandsize[1] != '':\n", " if sizetype[0]['name'] == mandsize[1].strip():\n", " product_sizes = []\n", " for size in mandsize[0]:\n", " new_size_term = doesprodattrexist(jsonprodattr['pa_size'], size.strip(), 'pa_size')\n", " if new_size_term != 0:\n", " product_sizes.append((new_size_term, False))\n", " else:\n", " newsizename = size.strip()\n", " newsizeslug = slugify(newsizename.strip())\n", " new_size_term = {'term_id':-1, 'name':newsizename, 'slug':newsizeslug, 'taxonomy':'pa_size'}\n", " product_sizes.append((new_size_term, True))\n", " # --> Should the current product sizes be mapped into any final sizes?\n", " if apply_finalsize_as_size == True:\n", " add_finalsize_now = False\n", " split_sizetomaps = sizemap['sizestomap'].split(';')\n", " #found_sizenames = []\n", " for sizetomap in split_sizetomaps.copy():\n", " current_found_sizenames = list(filter(lambda x: x[0]['name'].strip().lower() == sizetomap.strip().lower(), product_sizes))\n", " if current_found_sizenames:\n", " #found_sizenames = current_found_sizenames\n", " add_finalsize_now = True\n", " break\n", " if add_finalsize_now == True: \n", " new_size_term = doesprodattrexist(jsonprodattr['pa_size'], sizemap['finalsize'].strip(), 'pa_size')\n", " if new_size_term != 0:\n", " product_sizes.append((new_size_term, False))\n", " else:\n", " newsizename = sizemap['finalsize'].strip()\n", " newsizeslug = slugify(newsizename.strip())\n", " new_size_term = {'term_id':-1, 'name':newsizename, 'slug':newsizeslug, 'taxonomy':'pa_size'}\n", " product_sizes.append((new_size_term, True))\n", " for sizetomap in split_sizetomaps.copy():\n", " product_sizes = list(filter(lambda x: x[0]['name'].strip().lower() != sizetomap.strip().lower() or sizetomap.strip().lower() == sizemap['finalsize'].strip().lower(), product_sizes))\n", " # --> Second: Correct binds between sizes and sizetypes/sizetypemiscs!\n", " sizeid_col = product['sizetosizetypemaps']['size']\n", " sizetypeid_col = product['sizetosizetypemaps']['sizetype']\n", " product_size_col = []\n", " product_sizetype_col = []\n", " #for e in range(0, len(product_sizes)): product_size_col[e] = product_sizes[e][0]['term_id']\n", " #for e in range(0, len(product_sizetypes)): product_sizetype_col[e] = product_sizetypes[e][0]['term_id']\n", " for s in product_sizes: product_size_col.append(s[0]['term_id'])\n", " for s in product_sizetypes: product_sizetype_col.append(s[0]['term_id'])\n", " #count = 0\n", " #for sizeid in sizeid_col:\n", " # sizeid_col[count] = (doesprodattrexist(jsonprodattr['pa_size'], sizeid, 'pa_size'), False)\n", " # count+=1\n", " #count = 0\n", " #for sizetypeid in sizetypeid_col:\n", " # sizetypeid_col[count] = (doesprodattrexist(jsonprodattr['pa_sizetype'], sizetypeid, 'pa_sizetype'), False)\n", " # count+=1\n", " #CONVERT COMPARE ARRAYS\n", " #sizeid_col = [value for key, value in sizeid_col.items() if key == '']\n", " #sizetypeid_col = [value for key, value in sizetypeid_col.items() if key == '']\n", " #product_size_col = [value for key, value in product_sizes.items() if key == '']\n", " #product_sizetype_col = [value for key, value in product_sizetypes.items() if key == '']\n", " #SAVE VALUES FOR INSERT\n", " compare_sizeid = list(set(product_size_col) - set(sizeid_col))\n", " compare_sizetypeid = list(set(product_sizetype_col) - set(sizetypeid_col))\n", " if compare_sizetypeid and compare_sizeid:\n", " insert_sizetosizetype = []\n", " for sizetypeid_insert in compare_sizetypeid:\n", " #count = 0\n", " for sizeid_insert in compare_sizeid:\n", " insert_sizetosizetype.append((sizeid_insert, sizetypeid_insert, product['productid']))\n", " #insert_sizetosizetype[count] = (sizeid_insert, sizetypeid_insert, product['productid'])\n", " #count+=1\n", " #SAVE VALUES FOR REMOVAL\n", " compare_sizeid = list(set(sizeid_col) - set(product_size_col))\n", " compare_sizetypeid = list(set(sizetypeid_col) - set(product_sizetype_col))\n", " if compare_sizetypeid and compare_sizeid:\n", " remove_sizetosizetype = []\n", " for sizetypeid_remove in compare_sizetypeid:\n", " #count = 0\n", " for sizeid_remove in compare_sizeid:\n", " remove_sizetosizetype.append((sizeid_remove, sizetypeid_remove, product['productid']))\n", " #remove_sizetosizetype[count] = (sizeid_remove, sizetypeid_remove, product['productid'])\n", " #count+=1\n", " if product_sizetypemiscs and product_sizes:\n", " sizeid_col = product['sizetosizetypemaps']['size_misc']\n", " compare_sizetypemiscid = product['sizetosizetypemaps']['sizetype_misc']\n", " product_size_col = []\n", " product_sizetypemisc_col = []\n", " #for e in range(0, len(product_sizes)): product_size_col[e] = product_sizes[e][0]['term_id']\n", " #for e in range(0, len(product_sizetypes)): product_sizetypemisc_col[e] = product_sizetypemiscs[e][0]['term_id']\n", " for s in product_sizes: product_size_col.append(s[0]['term_id'])\n", " for s in product_sizetypemiscs: product_sizetypemisc_col.append(s[0]['term_id'])\n", " #count = 0\n", " #for sizeid in sizeid_col:\n", " # sizeid_col[count] = (doesprodattrexist(jsonprodattr['pa_size'], sizeid, 'pa_size'), False)\n", " # count+=1\n", " #count = 0\n", " #for sizetypemiscid in compare_sizetypemiscid:\n", " # compare_sizetypemiscid[count] = (doesprodattrexist(jsonprodattr['pa_sizetypemisc'], sizetypemiscid, 'pa_sizetypemisc'), False)\n", " # count+=1\n", " #SAVE VALUES FOR INSERT\n", " compare_sizeid = list(set(product_size_col) - set(sizeid_col))\n", " compare_sizetypemiscid = list(set(product_sizetypemisc_col) - set(compare_sizetypemiscid)) \n", " if compare_sizetypemiscid and compare_sizeid:\n", " insert_sizetosizetypemisc = []\n", " for sizetypemiscid_insert in compare_sizetypemiscid:\n", " #count = 0\n", " for sizeid_insert in compare_sizeid:\n", " insert_sizetosizetypemisc.append((sizeid_insert, sizetypemiscid_insert, product['productid']))\n", " #insert_sizetosizetypemisc[count] = (sizeid_insert, sizetypemiscid_insert, product['productid'])\n", " #count+=1\n", " #SAVE VALUES FOR REMOVAL\n", " compare_sizeid = list(set(sizeid_col) - set(product_size_col))\n", " compare_sizetypemiscid = list(set(compare_sizetypemiscid) - set(product_sizetypemisc_col))\n", " if compare_sizetypemiscid and compare_sizeid:\n", " remove_sizetosizetypemisc = []\n", " for sizetypemiscid_remove in compare_sizetypemiscid:\n", " #count = 0\n", " for sizeid_remove in compare_sizeid:\n", " remove_sizetosizetypemisc.append((sizeid_remove, sizetypemiscid_remove, product['productid']))\n", " #remove_sizetosizetypemisc[count] = (sizeid_remove, sizetypemiscid_remove, product['productid'])\n", " #count+=1 \n", " # --> Apply color, size, sex and brand to the product! (Filter the attributes before save)\n", " # --> (Filter the attributes before database save)\n", " attributes = []\n", " attribute_pos = 1 \n", " if product_brand:\n", " skip_domain_name = False\n", " if website['productmisc']:\n", " output = re.search(r'(skip_domainbrand_if_found)', website['productmisc'])\n", " if output is not None and len(output.group(0)) > 0:\n", " skip_domain_name = True\n", " brand_values = product['attributes']['brand']\n", " if brand_values and skip_exist_attr[0] != 1:\n", " existing_brands = re.split(',\\s*', brand_values)\n", " exist_brands = []\n", " #count = 0\n", " for brand in existing_brands.copy():\n", " '''if skip_domain_name is True:\n", " if domain_name != '':\n", " if brand.upper().find(domain_name.upper()) != -1:\n", " del existing_brands[count]\n", " continue\n", " brand = doesprodattrexist(jsonprodattr['pa_brand'], brand, 'pa_brand')\n", " existing_brands[count] = (brand, False)\n", " count+=1'''\n", " if skip_domain_name is True:\n", " if domain_name != '':\n", " if brand.upper().find(domain_name.upper()) != -1:\n", " #del existing_brands[count]\n", " continue\n", " brand = doesprodattrexist(jsonprodattr['pa_brand'], brand, 'pa_brand')\n", " notlist = list(filter(lambda x: x[0]['name'].lower() == brand['name'].lower(), exist_brands))\n", " if not notlist:\n", " exist_brands.append((brand, False))\n", " else:\n", " #del existing_brands[count]\n", " exist_brands = list(filter(lambda x: x[0]['name'].lower() != brand['name'].lower(), exist_brands))\n", " #exist_brands.append(notlist[0])\n", " continue\n", " #count+=1\n", " if skip_domain_name is True and len(product_brand) > 0 and len(exist_brands) > 0:\n", " product_brand = exist_brands\n", " else:\n", " #product_brand = product_brand + existing_brands\n", " product_brand = add_together_attrs(product_brand, exist_brands, 'pa_brand', jsonprodattr)\n", " #print('FINAL BRANDS: ' + json.dumps(product_brand))\n", " attributes.append({'name':'Brand', 'options':product_brand, 'position':attribute_pos, 'visible':1, 'variation':1})\n", " attribute_pos+=1\n", " if product_colors:\n", " color_values = product['attributes']['color']\n", " if color_values and skip_exist_attr[1] != 1:\n", " existing_colors = re.split(',\\s*', color_values)\n", " count = 0\n", " for color in existing_colors:\n", " color = doesprodattrexist(jsonprodattr['pa_color'], color, 'pa_color')\n", " existing_colors[count] = (color, False)\n", " count+=1\n", " #product_colors = product_colors + existing_colors\n", " product_colors = add_together_attrs(product_colors, existing_colors, 'pa_color', jsonprodattr)\n", " attributes.append({'name':'Color', 'options':product_colors, 'position':attribute_pos, 'visible':1, 'variation':1})\n", " attribute_pos+=1\n", " if product_sex:\n", " sex_values = product['attributes']['sex']\n", " if sex_values and skip_exist_attr[2] != 1:\n", " existing_sex = re.split(',\\s*', sex_values)\n", " count = 0\n", " for sex in existing_sex:\n", " sex = doesprodattrexist(jsonprodattr['pa_sex'], sex, 'pa_sex')\n", " existing_sex[count] = (sex, False)\n", " #existing_sex[count] = sex['term_id']\n", " count+=1\n", " #product_sex = product_sex + existing_sex\n", " #print('FINAL SEX BEFORE: ' + json.dumps(product_sex))\n", " product_sex = add_together_attrs(product_sex, existing_sex, 'pa_sex', jsonprodattr)\n", " #print('FINAL SEX AFTER: ' + json.dumps(product_sex))\n", " attributes.append({'name':'Sex', 'options':product_sex, 'position':attribute_pos, 'visible':1, 'variation':1})\n", " attribute_pos+=1\n", " if product_sizes:\n", " size_values = product['attributes']['size']\n", " if size_values and skip_exist_attr[3] != 1:\n", " existing_sizes = re.split(',\\s*', size_values)\n", " count = 0\n", " for size in existing_sizes:\n", " size = doesprodattrexist(jsonprodattr['pa_size'], size, 'pa_size')\n", " existing_sizes[count] = (size, False)\n", " count+=1\n", " #product_sizes = product_sizes + existing_sizes\n", " #print('FINAL SIZES BEFORE: ' + json.dumps(product_sizes))\n", " product_sizes = add_together_attrs(product_sizes, existing_sizes, 'pa_size', jsonprodattr)\n", " #print('FINAL SIZES AFTER: ' + json.dumps(product_sizes))\n", " attributes.append({'name':'Size', 'options':product_sizes, 'position':attribute_pos, 'visible':1, 'variation':1})\n", " attribute_pos+=1\n", " if product_sizetypes:\n", " sizetype_values = product['attributes']['sizetype']\n", " if sizetype_values and skip_exist_attr[4] != 1:\n", " existing_sizetypes = re.split(',\\s*', sizetype_values)\n", " count = 0\n", " for sizetype in existing_sizetypes:\n", " sizetype = doesprodattrexist(jsonprodattr['pa_sizetype'], sizetype, 'pa_sizetype')\n", " existing_sizetypes[count] = (sizetype, False)\n", " count+=1\n", " #product_sizetypes = product_sizetypes + existing_sizetypes\n", " product_sizetypes = add_together_attrs(product_sizetypes, existing_sizetypes, 'pa_sizetype', jsonprodattr)\n", " attributes.append({'name':'Sizetype', 'options':product_sizetypes, 'position':attribute_pos, 'visible':1, 'variation':1})\n", " attribute_pos+=1\n", " if product_sizetypemiscs:\n", " sizetypemisc_values = product['attributes']['sizetypemisc']\n", " if sizetypemisc_values and skip_exist_attr[5] != 1:\n", " existing_sizetypemiscs = re.split(',\\s*', sizetypemisc_values)\n", " count = 0\n", " for sizetypemisc in existing_sizetypemiscs:\n", " sizetypemisc = doesprodattrexist(jsonprodattr['pa_sizetypemisc'], sizetypemisc, 'pa_sizetypemisc')\n", " existing_sizetypemiscs[count] = (sizetypemisc, False)\n", " count+=1\n", " #product_sizetypemiscs = product_sizetypemiscs + existing_sizetypemiscs\n", " product_sizetypemiscs = add_together_attrs(product_sizetypemiscs, existing_sizetypemiscs, 'pa_sizetypemisc', jsonprodattr)\n", " attributes.append({'name':'Sizetypemisc', 'options':product_sizetypemiscs, 'position':attribute_pos, 'visible':1, 'variation':1})\n", " attribute_pos+=1\n", " attributes_to_store = attributes\n", " catstoaddresult = product_categories\n", " # --- Make sure to empty all the already-checked bits and join the productmisc. bits back together! --- #\n", " ###\n", " ###\n", " #pass\n", " except:\n", " #print(\"Error when scraping misc. product information for product ID \" + product['productid'] + \": \" + sys.exc_info()[0] + \" occured!\")\n", " print(traceback.format_exc())\n", " # >>> MAKE PRICES NUMERIC <<< #\n", " #if not re.match('[0-9]', price): price = ''\n", " #if not re.match('[0-9]', salesprice): salesprice = ''\n", " price = getmoneyfromtext(price)\n", " salesprice = getmoneyfromtext(salesprice)\n", " # >>> STORE PRODUCT VALUES IN MORPH.IO DATABASE <<< #\n", " if skipfinalsave == False:\n", " scraperwiki.sqlite.save(unique_keys=['productid'],\\\n", " data={'productid': product['productid'],\\\n", " 'url': product['url'],\\\n", " 'domain': product['domain'],\\\n", " 'price': price,\\\n", " 'salesprice': salesprice,\\\n", " 'domainmisc': json.dumps(domainmisc_array),\\\n", " 'prodlogurls': json.dumps(prodlog_image_urls),\\\n", " 'prodlogurl': productlogourl,\\\n", " 'finalimgurls': json.dumps(images),\\\n", " 'validimgurls': json.dumps(image_urls_valid),\\\n", " 'imgurls': json.dumps(image_urls),\\\n", " 'notfound': notfound,\\\n", " 'notavailable': notavailable,\\\n", " 'removeon404': shouldremoveonnotfound,\\\n", " 'soldoutfix': soldoutupdatemeta,\\\n", " 'soldouthtmlfix': soldouthtmlupdatemeta,\\\n", " 'catstoaddresult': json.dumps(catstoaddresult),\\\n", " 'attributes': json.dumps(attributes_to_store),\\\n", " 'sizetypemapsqls': json.dumps([insert_sizetosizetype,\\\n", " remove_sizetosizetype,\\\n", " insert_sizetosizetypemisc,\\\n", " remove_sizetosizetypemisc])})\n", " #browser.quit()\n", " totalscrapedcount = totalscrapedcount + 1\n", " except WebDriverException:\n", " print('Chrome not running properly - The product will be rescraped again!')\n", " jsonprods.append(product)\n", " time.sleep(2)\n", " continue\n", " except:\n", " #print(\"Error: \" + sys.exc_info()[0] + \" occured!\")\n", " print(traceback.format_exc())\n", " continue\n", " else:\n", " continue\n", " if website['productmisc'] != '':\n", " website['productmisc'] = orig_prodmisc\n", " offset = offset + limit\n", " amount_processed = amount_processed + limit\n", " #if offset < maxlimit or maxlimit == 0:\n", " if amount_processed < maxlimit or maxlimit == 0:\n", " r = requests.get(wp_connectwp_url + str(offset) + '/' + str(limit) + '/', headers=headers)\n", " jsonprods = r.json()\n", " #print(str(offset) + ' products has been scraped so far!')\n", " #print(str(totalscrapedcount) + ' products has been scraped so far!')\n", " else:\n", " jsonprods = None\n", "\n", "# --- EXECUTION SECTION --- # \n", " \n", "def savecurrfiltodb():\n", " scraperwiki.sqlite.execute(\"drop table if exists filestoexport\")\n", " with open(__file__, 'r') as file:\n", " try:\n", " file_text = json.dumps(file.readlines())\n", " filusid = '1'\n", " scraperwiki.sqlite.save(table_name = 'filestoexport', unique_keys=['file_id'], data={'file_id': filusid, 'file_cont': file_text})\n", " #time.sleep(10)\n", " print('Current file module export successful!')\n", " except:\n", " print(traceback.format_exc()) \n", " \n", "#mainfunc(0)\n", "savecurrfiltodb()\n", "#EOF\n"]
|
Average successful run time: 5 minutes
Total run time: 9 days
Total cpu time used: about 4 hours
Total disk space used: 42.6 MB