Python爬取wordpress

# /usr/bin/env python

# coding=utf8

'''python 爬取csdn 文章到 wordpress '''

import requests

import re

import json

import time

from bs4 import BeautifulSoup

from lxml import etree

from wordpress_xmlrpc import Client, WordPressPost

from wordpress_xmlrpc.methods.posts import NewPost

from csdn import CSDN

import pymysql

class WordPress:
    """Publish scraped CSDN articles to a WordPress site.

    Two publishing paths exist: ``sends`` posts through the XML-RPC API,
    while ``query`` writes rows directly into the WordPress MySQL tables.
    ``create`` scrapes one article and publishes it through ``query``.
    """

    # Attribution banner that is stripped from scraped article bodies.
    WATERMARK = "【工匠若水 http://blog.csdn.net/yanbober 未经允许严禁转载,请尊重作者劳动成果。私信联系我】"

    # NOTE(review): the markup that wrapped the content ("enable highlighting")
    # was stripped from the source this file was recovered from; <pre>...</pre>
    # is an assumption -- confirm against the theme's highlighter.
    CONTENT_PREFIX = '<pre>'
    CONTENT_SUFFIX = '</pre>'

    # Parameterized statements: the driver quotes every %s value itself, so
    # titles/bodies containing quotes can neither break nor inject into SQL.
    _SQL_INSERT_POST = (
        "INSERT INTO wp_posts(post_author,post_date,post_content,post_title,"
        "post_excerpt,post_status,comment_status,ping_status,post_name,"
        "to_ping,pinged,post_modified,post_content_filtered,post_parent,"
        "menu_order,post_type,comment_count) "
        "VALUES ('1',%s,%s,%s,'','publish','open','open',%s,'','',%s,'',"
        "'0','0','post','0')"
    )
    _SQL_UPDATE_GUID = "UPDATE `wp_posts` SET `guid` = %s WHERE `ID` = %s"
    _SQL_INSERT_CATEGORY = (
        "INSERT INTO wp_term_relationships(object_id,term_taxonomy_id,term_order) "
        "VALUES (%s,%s,'0')"
    )

    def __init__(self):
        """Open the XML-RPC client and the MySQL connection/cursor."""
        # XML-RPC endpoint plus the WordPress back-end account (placeholders).
        self.wp = Client('http://blog.zxb8.cc/xmlrpc.php', 'username', 'password')
        self.conn = pymysql.connect(host='104.224.151.80', port=3306, user='xxx',
                                    passwd='xx', db='blog', charset='utf8')
        self.cursor = self.conn.cursor()

    def sends(self, title, content):
        """Publish one post through the XML-RPC API.

        Args:
            title: Post title.
            content: Post body.
        """
        post = WordPressPost()
        post.title = title
        post.content = content
        post.post_status = 'publish'
        # Send to WordPress, then pause before the caller continues.
        self.wp.call(NewPost(post))
        time.sleep(3)
        print('发布成功')

    def create(self, url):
        """Scrape the article at ``url`` and insert it through ``query``.

        Args:
            url: Address of the CSDN article page.
        """
        print(url)
        csdn = CSDN(url)
        title = csdn.getTitle()
        content = csdn.getContent()
        img = csdn.getImg()
        print(img)
        # Truthiness check also tolerates a ``None`` result from getImg().
        if img:
            content += " ".join(img)
        # Alternative publishing path: self.sends(title, content)
        self.query(title, content, 1)
        time.sleep(3)
        print('发布成功')

    def query(self, title, content, cat):
        """Insert a post row, set its guid and attach it to a category.

        The set of tables touched was derived from the MySQL binlog of a
        regular WordPress publish (per the original author's note).

        Args:
            title: Post title; also stored as ``post_name``.
            content: Post body; the watermark is removed and the body is
                wrapped in ``CONTENT_PREFIX`` / ``CONTENT_SUFFIX``.
            cat: ``term_taxonomy_id`` the post is linked to.

        Raises:
            Exception: Whatever the database driver raises; the transaction
                is rolled back before the error is re-raised.
        """
        content = content.replace(self.WATERMARK, "")
        now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        content = self.CONTENT_PREFIX + content + self.CONTENT_SUFFIX

        try:
            self.cursor.execute(self._SQL_INSERT_POST,
                                (now, content, title, title, now))
            new_id = self.cursor.lastrowid

            # The guid can only be built once the auto-increment ID is known.
            guid = "http://blog.zxb8.cc/?p={}".format(new_id)
            self.cursor.execute(self._SQL_UPDATE_GUID, (guid, new_id))

            self.cursor.execute(self._SQL_INSERT_CATEGORY, (new_id, cat))
            self.conn.commit()
        except Exception:
            # Do not leave a half-written post (row without guid/category).
            self.conn.rollback()
            raise
        # Cursor and connection stay open so that further posts can reuse them.

if __name__ == '__main__':
    # Entry point: fetch one CSDN category listing and import every article
    # linked from it into WordPress.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }

    wordpress = WordPress()

    list_url = "https://blog.csdn.net/yanbober/article/category/6971209"
    response = requests.get(list_url, headers=headers)
    selector = etree.HTML(response.text)

    # Alternative link selector kept from the original:
    #   '//li[@class="blog-unit"]/a/@href'
    for url in selector.xpath('//div[@class="article_title"]//a/@href'):
        print('正在努力爬取中…', url)
        wordpress.create(url)

    # Disabled alternative: walk list pages 1..5 of another blog. The original
    # snippet requested ``list_url`` on every pass instead of the page URL it
    # had just built; that is corrected below.
    # for page in range(1, 6):
    #     page_url = "https://blog.csdn.net/mrlevo520/article/list/{}".format(page)
    #     response = requests.get(page_url, headers=headers)
    #     selector = etree.HTML(response.text)
    #     for url in selector.xpath('//li[@class="blog-unit"]/a/@href'):
    #         print('正在努力爬取中…', url)
    #         wordpress.create(url)

#/usr/bin/env python

# -*- coding:utf-8 -*-

#https://blog.csdn.net/MrLevo520/article/details/53158050

import requests

import json

import os

from lxml import etree

import time

import random

from datetime import *

class CSDN():
    """Scrape a single CSDN blog article page.

    The page is downloaded once in ``__init__`` and parsed with lxml; the
    ``get*`` methods then run XPath queries against the parsed tree.
    """

    # XPath prefix shared by the tag and statistics queries.
    _ARTICLE_MANAGE = ('//div[@id="article_details"]'
                       '/div[contains(@class,"article_manage")]')

    def __init__(self, url):
        """Download and parse the article at ``url``."""
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        }
        self.url = url
        self.selector = etree.HTML(self.getHtml())

    def getHtml(self):
        """Return the raw HTML of the article page."""
        # Send the browser User-Agent configured in __init__; the original
        # built the headers but never passed them along.
        response = requests.get(self.url, headers=self.headers)
        return response.text

    def getTitle(self):
        """Return the stripped article title.

        Raises:
            IndexError: If the title element is not present in the page.
        """
        # Alternative selector kept from the original:
        #   '//h1[@class="csdn_top"]/text()'
        title = self.selector.xpath('//span[@class="link_title"]/a/text()')
        return title[0].strip()

    def getTag(self):
        """Return the text of each tag link, skipping links without text."""
        tags = self.selector.xpath(
            self._ARTICLE_MANAGE + '//div[@class="article_l"]//a')
        taglist = []
        for tag in tags:
            texts = tag.xpath('./text()')
            # An anchor without a text node would otherwise raise IndexError.
            if texts:
                taglist.append(texts[0])
        return taglist

    def getReadNum(self):
        """Print the first three ``article_r`` spans (time, reads, comments)."""
        base = self._ARTICLE_MANAGE + '//div[@class="article_r"]'
        # Local names avoid shadowing the imported ``time`` module.
        posted = self.selector.xpath(base + '/span[1]/text()')
        read = self.selector.xpath(base + '/span[2]/text()')
        comment = self.selector.xpath(base + '/span[3]/text()')
        print(posted)
        print(read)
        print(comment)

    def getContent(self):
        """Return the article body as plain text.

        Raises:
            IndexError: If the ``markdown_views`` container is missing.
        """
        content = self.selector.xpath('//div[@class="markdown_views"]')
        # string(.) concatenates the text of every descendant node.
        return content[0].xpath('string(.)').strip()

    def getImg(self):
        """Download the article's images, re-upload them, return markup.

        Each image is saved under ``<cwd>/upload`` and pushed to the image
        host through ``upload``; images whose upload fails are skipped.

        Returns:
            A list of HTML snippets, one per uploaded image; an empty list
            when the article has no images.
        """
        imgs = self.selector.xpath('//div[@class="markdown_views"]//img/@src')
        if not imgs:
            # Nothing to fetch: return before touching the filesystem.
            return []

        list_imgs = []

        # Directory that holds the downloaded files.
        upload = os.path.join(os.getcwd(), "upload")
        os.makedirs(upload, exist_ok=True)

        for img_url in imgs:
            response = requests.get(img_url, headers=self.headers)

            # File name = timestamp + random number in 0..100, zero-padded to
            # at least two digits (the original padded 10 into "010").
            nowTime = datetime.now().strftime("%Y%m%d%H%M%S")
            randomNum = '{:02d}'.format(random.randint(0, 100))
            file_name = nowTime + randomNum + '.jpg'
            save_name = os.path.join(upload, file_name)
            print('download..', save_name)

            with open(save_name, 'wb') as f:
                f.write(response.content)

            remote_pic = self.upload(save_name)
            if remote_pic:
                # NOTE(review): the original literal was stripped from the
                # recovered source; an <img> tag pointing at the uploaded URL
                # is assumed -- confirm the exact markup.
                img_src = '<img src="{}" />'.format(remote_pic)
                list_imgs.append(img_src)

        return list_imgs

    def upload(self, save_name):
        """Upload a local image to sm.ms and return its public URL.

        Args:
            save_name: Path of the local file to upload.

        Returns:
            The value of ``data.url`` from the JSON reply when its ``code``
            is ``'success'``; otherwise ``None``.
        """
        url = 'https://sm.ms/api/upload'
        data = {'ssl': False, 'format': 'json'}

        # Close the file handle once the request has been sent.
        with open(save_name, 'rb') as fh:
            response = requests.post(url, files={'smfile': fh}, data=data)

        # Example of a successful reply:
        # {'code': 'success', 'data': {'path': '/2018/04/19/5ad7fd2f7e60c.jpg',
        #  'hash': 'vpw5S3armgducWz',
        #  'url': 'https://i.loli.net/2018/04/19/5ad7fd2f7e60c.jpg',
        #  'size': 215024, 'filename': '20160213173754690.jpg',
        #  'storename': '5ad7fd2f7e60c.jpg', 'width': 1366,
        #  'ip': '124.207.180.37', 'timestamp': 1524104495, 'height': 688,
        #  'delete': 'https://sm.ms/delete/vpw5S3armgducWz'}}
        result = json.loads(response.text)
        print(result)

        if result.get('code') == 'success':
            return result['data']['url']
        return None

此文由“快兔兔AI采集器”自动生成,目的为演示采集器效果,若侵权请及时联系删除。

原文链接:https://www.csdn.net/tags/NtDaIgysODExMjUtYmxvZwO0O0OO0O0O.html

更多内容