Basic Scrapy tutorial: scrape a site, then grab the data and store it in a database with peewee, using item pipelines.
1. Spider
import scrapy
from ..items import TutorialItem
from ..db_model import *
class QuotesSpider(scrapy.Spider):
    """Spider that turns the cards found on queued "pet-supplies" pages into items.

    The start URLs are collected from the ``Links`` table while the class body
    is executed (i.e. when this module is imported): every row whose ``link``
    contains "pet-supplies" and whose ``status`` is empty is appended to
    ``url_lists`` and its status is set to "fetched".
    """

    name = "quotes"

    url_lists = []
    with database.atomic():
        query = Links.select().where(
            (Links.link.contains("pet-supplies")) & (Links.status == "")
        )
        for data in query:
            url_lists.append(data.link)
            # NOTE: the row is flagged as "fetched" here, before any request
            # for it has been sent. A separate name is used so the SELECT
            # being iterated is not rebound inside its own loop.
            mark_fetched = Links.update(status="fetched").where(Links.id == data.id)
            mark_fetched.execute()  # Returns the number of rows that were updated.
    start_urls = url_lists

    def parse(self, response):
        """Yield one ``TutorialItem`` per card found in the response.

        ``category`` and ``sub_category`` are the last two "/"-separated
        segments of the response URL; the remaining fields are read from the
        card's own markup.
        """
        page = response.url.split("/")
        category, sub_category = page[-2], page[-1]

        for quote in response.css(".col-xl-9 .card "):
            # Build a fresh item for every card. Re-using a single instance
            # would make every yielded item alias the same object, so later
            # cards would overwrite the data of earlier ones.
            items = TutorialItem()
            items['category'] = category
            items['sub_category'] = sub_category
            items['link'] = quote.css(".card-body::attr(href)").get()
            items['title'] = quote.css(".card-title::text").get()
            items['image'] = quote.css(".avatar::attr(src)").get()
            items['products'] = quote.css(".card-text::text").get()
            yield items
2. Items
import scrapy
class TutorialItem(scrapy.Item):
    """Container for the values scraped from a single card."""

    # Values read from the card's markup.
    title = scrapy.Field()
    image = scrapy.Field()
    products = scrapy.Field()

    # Values taken from the page URL.
    category = scrapy.Field()
    sub_category = scrapy.Field()

    # Value of the card's href attribute.
    link = scrapy.Field()
3. Pipeline
from itemadapter import ItemAdapter
from db_model import *
class TutorialPipeline:
    """Item pipeline that saves every scraped item through the ``Products`` model."""

    def process_item(self, item, spider):
        """Print the item, save it as a ``Products`` row and return it unchanged.

        Args:
            item: Item exposing the keys ``link``, ``title``, ``category``,
                ``sub_category``, ``image`` and ``products``.
            spider: The spider that produced the item (not used here).

        Returns:
            The same ``item`` that was passed in.
        """
        print("pipeline", item['title'], item['image'], item['products'])

        # Model keyword arguments; note the item's "products" value is stored
        # under "products_count", and "status" is always the string "1".
        columns = {
            "link": item['link'],
            "title": item['title'],
            "category": item['category'],
            "sub_category": item['sub_category'],
            "image": item['image'],
            "products_count": item['products'],
            "status": "1",
        }
        record = Products(**columns)
        record.save()
        return item
4. Activate the pipeline in settings.py. Once this setting is added, every item the spider yields is passed to the pipeline.
# Enable the pipeline: maps the pipeline's dotted import path to its order
# value (see the Scrapy documentation for the ITEM_PIPELINES setting).
ITEM_PIPELINES = {"tutorial.pipelines.TutorialPipeline": 300}
Also, install the peewee module (`pip install peewee`) so the code can interact with the database.