Web developer: Фреймворк Scrapy

Scrapy — это прикладная среда для сканирования веб-сайтов и извлечения структурированных данных, которая может использоваться для широкого спектра полезных приложений, таких как интеллектуальный анализ данных, обработка информации или историческое архивирование.

Устанавливаем библиотеку:

pip install scrapy

Создаем файл spider.py:

import scrapy

class QuotesSpider(scrapy.Spider):
    """Collect quotes tagged "humor" from quotes.toscrape.com.

    Yields one dict per quote with ``author`` and ``text`` keys, then follows
    the "next" pagination link until the last page is reached.
    """

    name = 'quotes'

    start_urls = [
        'http://quotes.toscrape.com/tag/humor/',
    ]

    # One-pass cleanup table: drop the curly double quotes that wrap each
    # quote and turn the typographic apostrophe into a plain ASCII one.
    _TEXT_CLEANUP = str.maketrans({
        '\u201c': None,
        '\u201d': None,
        '\u2019': "'",
    })

    def parse(self, response):
        """Extract every quote on the page and schedule the next page.

        Args:
            response: the downloaded page handed over by Scrapy.

        Yields:
            dict: ``{'author': ..., 'text': ...}`` for each ``div.quote``;
            either value is ``None`` when the element is missing.
            Request: a follow-up request for the next page, if one exists.
        """
        for quote in response.css('div.quote'):
            text = quote.css('span.text::text').get()
            # .get() returns None when nothing matches; clean up only real
            # text instead of failing on None.
            if text is not None:
                text = text.translate(self._TEXT_CLEANUP)

            yield {
                'author': quote.xpath('span/small/text()').get(),
                'text': text,
            }

        next_page = response.css('li.next a::attr("href")').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)

И запускаем паука на обработку. (Внимание!!! Использовался статичный сайт.)

scrapy runspider spider.py -O quotes.json

Возможные выходные форматы файлов

*.jl - JSON Lines: каждый объект JSON записывается отдельной строкой

*.json

*.xml

*.csv - разделитель запятая (не точка с запятой!)

jsonlines, jsonl (то же, что *.jl), marshal, pickle - эти форматы также поддерживаются

Параметр -o, -O:

-o - дописывает данные в конец существующего файла

-O - перезаписывает файл

Извлечение данных с помощью селекторов

def parse(self, response):
    """Showcase of selector calls that can be made on ``response``.

    Every statement is a standalone example: apart from ``page``, ``a`` and
    ``quote`` the results are not stored, and the method returns nothing.
    A comment placed under a call shows the result recorded for it in these
    notes.
    """
    # get the URL
    page = response.url

    a = response.css('title')
    # [<Selector xpath='descendant-or-self::title' data='<title>Quotes to Scrape</title>'>]

    response.css('title::text').getall()
    # ['Quotes to Scrape']

    response.css('title').getall()
    # ['<title>Quotes to Scrape</title>']

    response.css('title::text').get()
    # 'Quotes to Scrape'

    response.css('title::text')[0].get()  # may raise an error if there are no elements
    # 'Quotes to Scrape'

    response.css('title::text').re(r'(\w+) to (\w+)')
    # ['Quotes', 'Scrape']

    response.xpath('//title')
    # [<Selector xpath='//title' data='<title>Quotes to Scrape</title>'>]

    response.xpath('//title/text()').get()
    # 'Quotes to Scrape'

    quote = response.css("div.quote")[0]  # get the first div with class "quote"
    quote.css("span.text::text").get()  # get its text

    response.css('li.next a').get()
    # '<a href="/page/2/">Next <span aria-hidden="true">→</span></a>'

    response.css('li.next a::attr(href)').get()
    # '/page/2/'

    # NOTE(review): the author marked this call as needing verification;
    # presumably .attrib exposes the attributes of the first matched
    # element — confirm against the Scrapy selector docs.
    response.css('li.next a').attrib['href']  # needs verification
    # '/page/2/'

    # The same XPath query, issued through response.selector and through
    # response directly.
    response.selector.xpath('//span/text()').get()
    response.xpath('//span/text()').get()

    # CSS and XPath selectors chained together.
    response.css('img').xpath('@src').getall()

    response.xpath('//div[@id="images"]/a/text()').get()

    response.css('img').attrib['src']

    response.xpath('//base/@href').get()

    # Substring match on an attribute: XPath form, then CSS form.
    response.xpath('//a[contains(@href, "image")]/@href').getall()
    response.css('a[href*=image]::attr(href)').getall()

    # XPath query with a variable ($val) supplied as a keyword argument.
    response.xpath('//div[@id=$val]/a/text()', val='images').get()

Использование аргументов паука

Вы можете предоставить своим паукам аргументы командной строки, используя параметр -a при их запуске:

scrapy crawl quotes -O quotes-humor.json -a tag=humor

Данные аргументы передаются методу паука __init__ и по умолчанию становятся атрибутами паука.

В этом примере значение, указанное для аргумента tag, будет доступно через self.tag. Вы можете использовать его, чтобы ваш паук выбирал только цитаты с определённым тегом, создавая URL-адрес на основе аргумента:

import scrapy

class QuotesSpider(scrapy.Spider):
    """Quotes spider whose start URL can be narrowed with ``-a tag=<name>``.

    Without the ``tag`` argument the crawl starts from the site root; with it,
    the crawl starts from that tag's listing page.
    """

    name = "quotes"

    def start_requests(self):
        """Yield the single initial request, honouring the optional tag."""
        # Spider arguments passed with -a become instance attributes, so the
        # tag may simply be absent.
        tag = getattr(self, 'tag', None)
        url = 'http://quotes.toscrape.com/'
        if tag is not None:
            url += 'tag/' + tag
        yield scrapy.Request(url, self.parse)

    def parse(self, response):
        """Yield a text/author dict per quote, then follow the next page."""
        for block in response.css('div.quote'):
            text = block.css('span.text::text').get()
            author = block.css('small.author::text').get()
            yield {'text': text, 'author': author}

        next_href = response.css('li.next a::attr(href)').get()
        if next_href is None:
            # Last page: nothing more to schedule.
            return
        yield response.follow(next_href, self.parse)

Если вы передадите этому пауку аргумент tag=humor, вы заметите, что он будет посещать только URL-адреса из тега humor, например http://quotes.toscrape.com/tag/humor.

Динамический сайт

class DynamicSpider(scrapy.Spider):
    """Scrape JavaScript-rendered product pages by fetching them via Splash.

    Each URL in ``url`` is sent to the Splash ``execute`` endpoint together
    with the Lua script in ``script``; the rendered HTML comes back to
    ``parse``, which yields one dict per product page.
    """

    # NOTE(review): SplashRequest is not imported anywhere in the visible
    # source; presumably ``from scrapy_splash import SplashRequest`` is
    # required at the top of the file — confirm.

    name = 'products'

    url = [
        'https://www.lazada.sg/products/esogoal-tactical-sling-bag-outdoor-chest-pack-shoulder-backpack-military-sport-bag-for-trekking-camping-hiking-rover-sling-daypack-for-men-women-i204814494-s353896924.html?mp=1',
        'https://www.lazada.sg/products/esogoal-selfie-stick-tripod-extendable-selfie-stick-monopod-with-integrated-tripod-and-bluetooth-remote-shutter-wireless-selfie-stick-tripod-for-cellphonecameras-i205279097-s309050125.html?mp=1',
        'https://www.lazada.sg/products/esogoal-mini-umbrella-travel-umbrella-sun-rain-umbrella8-ribs-98cm-big-surface-lightweight-compact-parasol-uv-protection-for-men-women-i204815487-s308312226.html?mp=1',
        'https://www.lazada.sg/products/esogoal-2-in-1-selfie-stick-tripod-bluetooth-selfie-stand-with-remote-shutter-foldable-tripod-monopod-i279432816-s436738661.html?mp=1',
    ]

    # Lua script run by Splash: load the page, pause for ``args.wait``
    # seconds so client-side rendering can finish, then return the HTML.
    # The pause is read from the request args rather than hard-coded, so the
    # value passed in start_requests() is the one that takes effect.
    script = """
    function main(splash, args)
        assert(splash:go(args.url))
        assert(splash:wait(args.wait))
        return {
            html = splash:html()
        }
    end
    """

    def start_requests(self):
        """Yield one Splash-rendered request per product URL."""
        for link in self.url:
            yield SplashRequest(
                url=link,
                callback=self.parse,
                endpoint='execute',
                # 2.5 s is the pause the script previously hard-coded; the
                # old 'wait': 1.5 arg was never read by the script.
                args={'wait': 2.5, 'lua_source': self.script},
                dont_filter=True,
            )

    def parse(self, response):
        """Yield the title, price and description found on a product page.

        A value is ``None`` when its XPath matches nothing.
        """
        yield {
            'title': response.xpath("//span[@class='pdp-mod-product-badge-title']/text()").get(),
            'price': response.xpath("//span[contains(@class, 'pdp-price')]/text()").get(),
            'description': response.xpath("//div[@id='module_product_detail']/h2/text()").get(),
        }

Фреймворк Scrapy

Извлечение данных с помощью селекторов

Использование аргументов паука

Как установить buildozer для kivy