I'm new to Python and web scraping. I found these 2 programs on the internet. I'm trying to figure out how to add random proxy generating from def get_proxies() to def webParser() method. So each url request will get random proxy address. Is this possible? Please help.
from lxml import html
import csv, os, json
import requests
from lxml.html import fromstring
from itertools import cycle
#from exceptions import ValueError
import sys
from time import sleep
def get_proxies():
url = 'https://free-proxy-list.net/'
response = requests.get(url)
parser = fromstring(response.text)
proxies = set()
for i in parser.xpath('//tbody/tr')[:10]:
if i.xpath('.//td[7][contains(text(),"yes")]'):
# Grabbing IP and corresponding PORT
proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])
proxies.add(proxy)
return proxies
proxies = get_proxies()
proxy_pool = cycle(proxies)
url = 'https://httpbin.org/ip'
for i in range(1,11):
#Get a proxy from the pool
proxy = next(proxy_pool)
print("Request #%d"%i)
try:
response = requests.get(url,proxies={"http": proxy, "https": proxy})
print(response.json())
except:
#Most free proxies will often get connection errors. You will have retry the entire request using another proxy to work.
#We will just skip retries as its beyond the scope of this tutorial and we are only downloading a single url
print("Skipping. Connnection error")
def webParser(url):
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
page = requests.get(url, headers=headers)
while True:
sleep(3)
try:
doc = html.fromstring(page.content)
XPATH_NAME = '//h1[@id="title"]//text()'
XPATH_SALE_PRICE = '//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()'
XPATH_ORIGINAL_PRICE = '//td[contains(text(),"List Price") or contains(text(),"M.R.P") or contains(text(),"Price")]/following-sibling::td/text()'
XPATH_CATEGORY = '//a[@class="a-link-normal a-color-tertiary"]//text()'
XPATH_AVAILABILITY = '//div[@id="availability"]//text()'
RAW_NAME = doc.xpath(XPATH_NAME)
RAW_SALE_PRICE = doc.xpath(XPATH_SALE_PRICE)
RAW_CATEGORY = doc.xpath(XPATH_CATEGORY)
RAW_ORIGINAL_PRICE = doc.xpath(XPATH_ORIGINAL_PRICE)
RAw_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
NAME = ' '.join(''.join(RAW_NAME).split()) if RAW_NAME else None
SALE_PRICE = ' '.join(''.join(RAW_SALE_PRICE).split()).strip() if RAW_SALE_PRICE else None
CATEGORY = ' > '.join([i.strip() for i in RAW_CATEGORY]) if RAW_CATEGORY else None
ORIGINAL_PRICE = ''.join(RAW_ORIGINAL_PRICE).strip() if RAW_ORIGINAL_PRICE else None
AVAILABILITY = ''.join(RAw_AVAILABILITY).strip() if RAw_AVAILABILITY else None
if not ORIGINAL_PRICE:
ORIGINAL_PRICE = SALE_PRICE
if page.status_code != 200:
raise ValueError('captha')
data = {
'NAME': NAME,
'SALE_PRICE': SALE_PRICE,
'CATEGORY': CATEGORY,
'ORIGINAL_PRICE': ORIGINAL_PRICE,
'AVAILABILITY': AVAILABILITY,
'URL': url,
}
return data
except Exception as e:
print(e)
def ReadAsin():
# AsinList = csv.DictReader(open(os.path.join(os.path.dirname(__file__),"Asinfeed.csv")))
AsinList = ['ITM001',
'ITM002',
'ITM003',
'ITM004',
'ITM005'
]
extracted_data = []
for i in AsinList:
url = "http://www.ecomsite.com/item/" + i
print
"Processing: " + url
extracted_data.append(webParser(url))
sleep(5)
f = open('data.json', 'w')
json.dump(extracted_data, f, indent=4)
if __name__ == "__main__":
ReadAsin()
Aucun commentaire:
Enregistrer un commentaire