幫朋友製作一個網站,需要一些產品數據信息。因為是代理其他公司的產品,所以直接爬取代理公司的產品數據。
1. 設計數據庫
from django.db import models
from uuslug import slugify
import uuid
import os
def products_directory_path(instance, filename):
    """Build the storage path for a product's main image.

    Intended as an ``upload_to`` callable: the uploaded file's own name is
    replaced by 8 random hex characters and only its extension is kept.

    Args:
        instance: Object being saved; its ``title`` names the sub-folder.
        filename: Name of the uploaded file.

    Returns:
        Relative path ``images/products/<title>/<random><ext>``, where
        ``<ext>`` includes the leading dot and is empty for names that have
        no extension.
    """
    # splitext() yields '' when there is no extension; split('.')[-1] would
    # instead return the whole name and produce e.g. "1a2b3c4d.photo".
    ext = os.path.splitext(filename)[1]
    filename = uuid.uuid4().hex[:8] + ext
    return os.path.join('images', "products", instance.title, filename)
def product_relatedimage_directory_path(instance, filename):
    """Build the storage path for an image related to a product.

    The uploaded file's own name is replaced by 8 random hex characters and
    only its extension is kept.

    Args:
        instance: Object being saved; ``instance.product.title`` names the
            sub-folder.
        filename: Name of the uploaded file.

    Returns:
        Relative path ``images/product_relatedimage/<title>/<random><ext>``,
        where ``<ext>`` includes the leading dot and is empty for names that
        have no extension.
    """
    # splitext() yields '' when there is no extension; split('.')[-1] would
    # instead return the whole name and produce e.g. "1a2b3c4d.photo".
    ext = os.path.splitext(filename)[1]
    filename = uuid.uuid4().hex[:8] + ext
    return os.path.join('images', "product_relatedimage", instance.product.title, filename)
class ProductsCategory(models.Model):
    """Product category, optionally nested under a parent category."""

    name = models.CharField('產(chǎn)品分類名', max_length=80, unique=True)
    description = models.TextField('產(chǎn)品分類描述', blank=True, null=True)
    slug = models.SlugField('slug', max_length=80, blank=True, null=True)
    parent_category = models.ForeignKey('self', verbose_name="父級分類", blank=True, null=True, on_delete=models.CASCADE)

    def save(self, *args, **kwargs):
        """Derive ``slug`` from ``name`` for new rows or when it is empty, then save."""
        if not self.id or not self.slug:
            # A transliterated name can be much longer than the name itself,
            # so cap the slug at the column width instead of overflowing it.
            slug_limit = self._meta.get_field('slug').max_length
            self.slug = slugify(self.name, max_length=slug_limit)
        super().save(*args, **kwargs)

    def __str__(self):
        return self.name

    class Meta:
        ordering = ['name']
        verbose_name = "產(chǎn)品分類"
        verbose_name_plural = verbose_name
class ProductsTag(models.Model):
    """Free-form tag that can be attached to products."""

    name = models.CharField('產(chǎn)品標簽名', max_length=30, unique=True)
    slug = models.SlugField('slug', max_length=40)

    def __str__(self):
        return self.name

    def save(self, *args, **kwargs):
        """Derive ``slug`` from ``name`` for new rows or when it is empty, then save."""
        if not self.id or not self.slug:
            # A transliterated name can be much longer than the name itself,
            # so cap the slug at the column width instead of overflowing it.
            slug_limit = self._meta.get_field('slug').max_length
            self.slug = slugify(self.name, max_length=slug_limit)
        super().save(*args, **kwargs)

    class Meta:
        ordering = ['name']
        verbose_name = "產(chǎn)品標簽"
        verbose_name_plural = verbose_name
class Product(models.Model):
    """A product with its main image, technical-parameter markup, category and tags."""

    title = models.CharField('標題', max_length=255, unique=True)
    slug = models.SlugField('slug', max_length=255, blank=True, null=True)
    jscs = models.TextField('技術(shù)參數(shù)', blank=True, null=True)
    image = models.ImageField(upload_to=products_directory_path, verbose_name="產(chǎn)品圖片")
    views = models.PositiveIntegerField('瀏覽量', default=0)
    category = models.ForeignKey('ProductsCategory', verbose_name='分類', on_delete=models.CASCADE, blank=True, null=True)
    tags = models.ManyToManyField('ProductsTag', verbose_name='標簽集合', blank=True)

    def save(self, *args, **kwargs):
        """Derive ``slug`` from ``title`` for new rows or when it is empty, then save."""
        if not self.id or not self.slug:
            # A transliterated title can be much longer than the title itself,
            # so cap the slug at the column width instead of overflowing it.
            slug_limit = self._meta.get_field('slug').max_length
            self.slug = slugify(self.title, max_length=slug_limit)
        super().save(*args, **kwargs)

    def update_views(self):
        """Increase the view counter by one, writing only the ``views`` column."""
        # Let the database perform the increment: incrementing in Python and
        # writing the result back loses counts when two requests overlap.
        self.views = models.F('views') + 1
        self.save(update_fields=['views'])
        # Swap the F() expression on this instance for the stored integer.
        self.refresh_from_db(fields=['views'])

    def get_pre(self):
        """Return the product with the closest smaller id, or None."""
        return Product.objects.filter(id__lt=self.id).order_by('-id').first()

    def get_next(self):
        """Return the product with the closest larger id, or None."""
        return Product.objects.filter(id__gt=self.id).order_by('id').first()

    def __str__(self):
        return self.title

    class Meta:
        verbose_name = "產(chǎn)品"
        verbose_name_plural = verbose_name
class ProductAdvantage(models.Model):
    """One advantage / feature text belonging to a product."""

    content = models.TextField('產(chǎn)品優(yōu)勢', blank=True, null=True)
    product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

    def __str__(self):
        # content is nullable, and __str__ must return a str (returning None
        # raises TypeError), so fall back to an empty string.
        return self.content or ''

    class Meta:
        verbose_name = "產(chǎn)品優(yōu)勢"
        verbose_name_plural = verbose_name
class ProductBody(models.Model):
    """One paragraph of descriptive text belonging to a product."""

    body = models.CharField('產(chǎn)品內(nèi)容', max_length=256, blank=True, null=True)
    product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

    def __str__(self):
        # product is nullable; dereferencing it unconditionally raises
        # AttributeError for rows that are not attached to a product.
        if self.product is not None:
            return self.product.title
        return self.body or ''

    class Meta:
        verbose_name = "產(chǎn)品內(nèi)容"
        verbose_name_plural = verbose_name
2. 腳本編寫
2.1 編寫獲取網頁源代碼的函數
def get_one_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The body decoded as UTF-8 when the server answers with status 200,
        otherwise None (non-200 status or a requests-level failure such as a
        connection error, timeout or invalid URL).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    try:
        # requests applies no timeout by default, so a stalled server would
        # otherwise block the crawl.
        res = requests.get(url=url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None
    if res.status_code != 200:
        return None
    res.encoding = 'utf-8'
    return res.text
2.2 根據 base 頁面獲取所有產品分類頁面的鏈接
if __name__ == '__main__':
    from urllib.parse import urljoin

    # `url` (the entry page) is expected to be defined at module level.
    content = get_one_page(url)
    if content is None:
        # get_one_page() reports every failure as None, which etree.HTML()
        # cannot parse.
        raise SystemExit('failed to fetch ' + url)
    tree = etree.HTML(content)
    # Links to the product-category pages
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
    # Resolve each href against the site root. A separate loop name keeps the
    # module-level `url` intact.
    for category_href in catgory_urls:
        print(urljoin('http://www.kexinjianji.com', category_href))
2.3 根據產品分類頁面鏈接獲取對應的所有產品鏈接
if __name__ == '__main__':
    from urllib.parse import urljoin

    # `url` (a category page) is expected to be defined at module level.
    content = get_one_page(url)
    if content is None:
        # get_one_page() reports every failure as None, which etree.HTML()
        # cannot parse.
        raise SystemExit('failed to fetch ' + url)
    tree = etree.HTML(content)
    # Category name; xpath() returns an empty list when the heading is absent
    catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
    print("產(chǎn)品分類:" + (catgory[0] if catgory else ''))
    # Links to the products listed in this category
    urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
    # Resolve each href against the site root. A separate loop name keeps the
    # module-level `url` intact.
    for product_href in urls:
        print(urljoin('http://www.kexinjianji.com', product_href))
    print("=====================================================")
兩者結合起來就可以打印出所有產品鏈接
if __name__ == '__main__':
    from urllib.parse import urljoin

    site = 'http://www.kexinjianji.com'

    # `url` (the entry page) is expected to be defined at module level.
    content = get_one_page(url)
    if content is None:
        # get_one_page() reports every failure as None, which etree.HTML()
        # cannot parse.
        raise SystemExit('failed to fetch ' + url)
    tree = etree.HTML(content)
    # Links to the product-category pages
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
    for category_href in catgory_urls:
        category_url = urljoin(site, category_href)
        category_html = get_one_page(category_url)
        if category_html is None:
            # One unreachable category must not abort the whole crawl.
            print('出錯url:' + category_url)
            continue
        category_tree = etree.HTML(category_html)
        # Category name; xpath() returns an empty list when the heading is absent
        catgory = category_tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
        print("產(chǎn)品分類:" + (catgory[0] if catgory else ''))
        # Links to the products listed in this category
        urls = category_tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        for product_href in urls:
            print(urljoin(site, product_href))
        print("=====================================================")
2.4 使用 xpath 解析函數返回的產品鏈接內容
if __name__ == '__main__':
    from urllib.parse import urljoin

    # `url` (a product page) is expected to be defined at module level.
    content = get_one_page(url)
    if content is None:
        # get_one_page() reports every failure as None, which etree.HTML()
        # cannot parse.
        raise SystemExit('failed to fetch ' + url)
    tree = etree.HTML(content)
    # Product name
    title = tree.xpath('//*[@id="wrap"]//h1/text()')
    images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
    # Product image: urljoin() avoids the "//" that results from gluing a
    # base ending in "/" onto a src that starts with "/".
    images_url = urljoin('http://www.kexinjianji.com/', images[0])
    # Performance features
    xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
    # Technical parameters: serialised markup of the first table. Pages with
    # no <table> get an empty string instead of an IndexError.
    tables = tree.xpath('//table')
    jscs_str = etree.tostring(tables[0], encoding='utf-8').decode('utf-8') if tables else ''
    # Product description paragraphs
    cpnr = tree.xpath('//div[@class="describe"]/p')
    print('產(chǎn)品名稱:' + title[0])
    print('產(chǎn)品圖片:' + images_url)
    for td in xntd:
        print('性能特點:' + td)
    print('技術(shù)參數(shù):' + jscs_str)
    for cp in cpnr:
        # string(.) concatenates all text below the current element
        cp = cp.xpath('string(.)')
        print('產(chǎn)品內(nèi)容:' + cp)
    print('============================================')
將三者結合在一起就可以獲取所有產品信息
if __name__ == '__main__':
    from urllib.parse import urljoin

    site = 'http://www.kexinjianji.com'

    # `url` (the entry page) is expected to be defined at module level.
    content = get_one_page(url)
    if content is None:
        # get_one_page() reports every failure as None, which etree.HTML()
        # cannot parse.
        raise SystemExit('failed to fetch ' + url)
    tree = etree.HTML(content)
    # Links to the product-category pages
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
    for category_href in catgory_urls:
        category_url = urljoin(site, category_href)
        category_html = get_one_page(category_url)
        if category_html is None:
            # One unreachable category must not abort the whole crawl.
            print('出錯url:' + category_url)
            continue
        category_tree = etree.HTML(category_html)
        # Category name
        catgory = category_tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
        # Links to the products listed in this category
        urls = category_tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        for product_href in urls:
            product_url = urljoin(site, product_href)
            content = get_one_page(product_url)
            # Best effort per product: any failure is reported with its URL
            # and the crawl moves on to the next product.
            try:
                page = etree.HTML(content)
                # Product name
                title = page.xpath('//*[@id="wrap"]//h1/text()')
                images = page.xpath('//div[@class="sol_tj_left"]/a/img/@src')
                # Product image
                images_url = urljoin(site, images[0])
                # Performance features
                xntd = page.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
                # Technical parameters: not printed here, but still extracted
                # so pages without a <table> are reported as failing URLs.
                jscs = page.xpath('//table')[0]
                jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
                # Product description paragraphs
                cpnr = page.xpath('//div[@class="describe"]/p')
                print("產(chǎn)品分類:" + catgory[0])
                print('產(chǎn)品鏈接:' + product_url)
                print('產(chǎn)品名稱:' + title[0])
                print('產(chǎn)品圖片:' + images_url)
                for td in xntd:
                    print('性能特點:' + td.strip())
                for cp in cpnr:
                    # string(.) concatenates all text below the current element
                    cp = cp.xpath('string(.)')
                    print('產(chǎn)品內(nèi)容:' + cp)
                print('============================================')
            except Exception as e:
                print(e)
                print('出錯url:' + product_url)
3. 存儲到 django 模型
import requests
from lxml.html import etree
import os
import django
import uuid
from django.core.files.base import ContentFile
# Use the project's settings module unless DJANGO_SETTINGS_MODULE is already
# set in the environment (setdefault does not overwrite an existing value).
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jiaobanzhan.settings")
# Initialise Django for standalone use. This has to run before the model
# import below, which is why that import is not at the top of the file.
django.setup()
from products.models import ProductBody, ProductsCategory, Product, ProductAdvantage
# Page the crawl starts from; the category links are read from it.
url = 'http://www.kexinjianji.com/product/hzshntjbz_1/'
def get_one_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The body decoded as UTF-8 when the server answers with status 200,
        otherwise None (non-200 status or a requests-level failure such as a
        connection error, timeout or invalid URL).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    try:
        res = requests.get(url=url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Name the failing URL and the cause instead of a placeholder message.
        print('request failed: {} ({})'.format(url, exc))
        return None
    if res.status_code != 200:
        return None
    res.encoding = 'utf-8'
    return res.text
if __name__ == '__main__':
    from urllib.parse import urljoin, urlparse

    from django.db import transaction

    site = 'http://www.kexinjianji.com'

    content = get_one_page(url)
    if content is None:
        # get_one_page() reports every failure as None, which etree.HTML()
        # cannot parse.
        raise SystemExit('failed to fetch ' + url)
    tree = etree.HTML(content)
    # Links to the product-category pages
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
    for category_href in catgory_urls:
        category_url = urljoin(site, category_href)
        category_html = get_one_page(category_url)
        if category_html is None:
            # One unreachable category must not abort the whole import.
            print('出錯url:' + category_url)
            continue
        category_tree = etree.HTML(category_html)
        # Category name
        p_catgory = category_tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
        # Links to the products listed in this category
        urls = category_tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        for product_href in urls:
            product_url = urljoin(site, product_href)
            content = get_one_page(product_url)
            # Best effort per product: any failure is reported with its URL
            # so it can be handled by hand afterwards.
            try:
                page = etree.HTML(content)
                # Product name
                title = page.xpath('//*[@id="wrap"]//h1/text()')
                images = page.xpath('//div[@class="sol_tj_left"]/a/img/@src')
                # Product image
                images_url = urljoin(site, images[0])
                # Performance features
                xntd = page.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
                # Technical parameters: pages without a <table> raise here and
                # are reported as failing URLs.
                jscs = page.xpath('//table')[0]
                jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
                # Product description paragraphs
                cpnr = page.xpath('//div[@class="describe"]/p')

                # Fetch the category, creating it on first sight.
                catgory = p_catgory[0]
                products_catgory, _created = ProductsCategory.objects.get_or_create(name=catgory)
                print(products_catgory)

                # Download the product image. raise_for_status() keeps an HTTP
                # error page from being stored as if it were the picture.
                image_content = requests.get(url=images_url, timeout=10)
                image_content.raise_for_status()
                # Take the extension from the URL path only, so a query
                # string cannot leak into the file name.
                ext = os.path.splitext(urlparse(images_url).path)[1]
                filename = uuid.uuid4().hex[:8] + ext
                upload_image_file = ContentFile(image_content.content, name=filename)

                # Save the product and its child rows together so a failure
                # part-way through does not leave a half-imported product.
                with transaction.atomic():
                    product = Product(title=title[0], jscs=jscs_str, image=upload_image_file, category=products_catgory)
                    product.save()
                    for td in xntd:
                        # The XPath also matches whitespace-only text nodes;
                        # strip them and skip the empty ones.
                        td = td.strip()
                        if not td:
                            continue
                        ProductAdvantage(content=td, product=product).save()
                    for cp in cpnr:
                        # string(.) concatenates all text below the element
                        ProductBody(body=cp.xpath('string(.)'), product=product).save()
            except Exception as e:
                print(e)
                print('出錯url:' + product_url)
最後自己手動處理出錯的 url(這些頁面沒有獲取到技術參數,因為技術參數是一張圖片)
4. 總結
1. xpath 獲取標簽內容時,p 標簽中嵌套 span 標簽,源碼如下
<div class="describe" style="position: relative;">
<p><span>板 寬:</span>1500mm</p>
<p><span>板 厚:</span>4.5 mm</p>
<p><span>出料口:</span>6口</p>
<p><span>重 量:</span>6000 kg</p>
</div>
使用 xpath 獲取 p 標簽內容
我想得到的效果如下
板 寬:1500mm
板 厚:4.5 mm
出料口:6口
重 量:6000 kg
使用以下 xpath 只能把 span 文本和 p 文本分開獲取,不是想要的效果
//div[@class="describe"]/p/span/text()|//div[@class="describe"]/p/text()
百度之後找到的解決辦法:使用 xpath('string(.)')
1. 先獲取所有 p 標簽
cpnr = tree.xpath('//div[@class="describe"]/p')
2. 使用 **string(.)** 獲取每個 p 標簽下的所有文本
cp = cp.xpath('string(.)')
循環遍歷所有 p 標簽即可
到此這篇關於 python 利用 xpath 爬取網上數據並存儲到 django 模型中的文章就介紹到這了,更多相關 xpath 爬取網上數據存儲到 django 模型的內容請搜索腳本之家以前的文章或繼續瀏覽下面的相關文章,希望大家以後多多支持腳本之家!
您可能感興趣的文章:
- django 模型查詢操作的實現
- Django 數據模型中 on_delete 使用詳解
- Django Admin 後臺模型列表頁面如何添加自定義操作按鈕
- Django 模型驗證器介紹與源碼分析
- Django3 中的自定義用戶模型實例詳解
- Django CBV 模型源碼運行流程詳解
- Python Django 模型詳解