import json

import pymysql
import requests
from lxml import etree
# Request headers sent with every HTTP call in this script (a desktop-Chrome
# style header set).
headers = {
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (Khtml, like Gecko) '
        'Chrome/69.0.3497.100 Safari/537.36'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8'
    ),
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}

# Feed endpoint with the two paging parameters left open.
FEED_URL_TEMPLATE = (
    "http://v2.sohu.com/public-api/feed"
    "?scene=CHANNEL&sceneId=10&page={page}&size={size}"
)


def build_feed_url(page: int = 1, size: int = 10) -> str:
    """Return the feed API URL for one page of results.

    Args:
        page: Page number to request; 1 is the first page.
        size: Number of items per page.

    Returns:
        The fully formatted feed URL.
    """
    return FEED_URL_TEMPLATE.format(page=page, size=size)


# Default request: first page, ten items (same value as the original literal).
url = build_feed_url()
从这个数据接口的形式可以看到，它有两个参数 page 和 size：page=1 表示第一页，size=10 表示每页十条数据。我们只需要改变 page 的值，就可以源源不断地获取数据了。当然，获取数据要有节制，不要给对方服务器造成太大压力。
# Fetch the feed endpoint and keep the raw response body as text.
# A timeout is set because requests otherwise waits indefinitely on a stalled
# connection; raise_for_status() turns HTTP 4xx/5xx into an exception instead
# of silently handing an error page to the code that follows.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
json_str = response.text
[{ "id": 357806424, "authorId": 557006, "authorName": "FX168", "authorPic": "//sucimg.itc.cn/avatarimg/34ca41ae9ad04be68072f8894d7124b7_1491542550231", "focus": "//5b0988e595225.cdn.sohucs.com/c_fill,w_600,h_300,g_faces/images/20191202/cc5eda06fba94b3fb7ed0c1c2faea9a6.jpeg", "picUrl": "//5b0988e595225.cdn.sohucs.com/c_fill,w_150,h_100,g_faces,q_70/images/20191202/cc5eda06fba94b3fb7ed0c1c2faea9a6.jpeg", "images": ["//5b0988e595225.cdn.sohucs.com/c_fill,w_150,h_100,g_faces,q_70/images/20191202/cc5eda06fba94b3fb7ed0c1c2faea9a6.jpeg", "//5b0988e595225.cdn.sohucs.com/c_fill,w_150,h_100,g_faces,q_70/images/20191202/df1ed938d9614cf690f87a58577ce07a.png"], "title": "70年来首次,美国成石油净出口国!国际油价暴跌近5%,一切才刚刚开始?", "mobileTitle": "70年来首次,美国成石油净出口国!国际油价暴跌近5%,一切才刚刚开始?", "tags": [{ "id": 70694, "name": "沙特", "channelId": 0, "channelName": null, "categoryId": 0, "categoryName": null, "config": null, "introduction": null, "secureScore": 100, "hotSpot": false }, { "id": 68937, "name": "美国", "channelId": 0, "channelName": null, "categoryId": 0, "categoryName": null, "config": null, "introduction": null, "secureScore": 100, "hotSpot": false }, { "id": 68938, "name": "俄罗斯", "channelId": 0, "channelName": null, "categoryId": 0, "categoryName": null, "config": null, "introduction": null, "secureScore": 100, "hotSpot": false }], "publicTime": 1575262702000, "channelId": 0, "channelName": null, "channelUrl": "", "categoryId": 0, "categoryName": null, "headImage": null, "cmsId": 0, "originalSource": "http://mp.weixin.qq.com/s?__biz=MjM5OTAwOTMyMA==&mid=2650280772&idx=1&sn=85dd7f58ab6b292fcff2d57a677a35dc", "outerLink": false, "otherId": 0, "passport": "fx168caijing@sohu.com", "personalPage": "http://mp.sohu.com/profile?xpt=ZngxNjhjYWlqaW5nQHNvaHUuY29t", "videoInfo": null, "type": 0, "cover": null, "tkd": null, "secureScore": 100 }]
文章详情页的地址由上面返回数据中的 id 和 authorId 拼接而成，例如：http://www.sohu.com/a/357806424_557006
# Download the page at `url` and collect the text nodes of every <p> that sits
# under <article class="article">.
# NOTE(review): `url` is used exactly as it is defined earlier in the script;
# the article address shown in the surrounding text is presumably what should
# be fetched here — confirm before running.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
html = response.text
# Fetch the content. The original expression used typographic quotes and
# discarded its result; the list of text nodes is now kept in a variable.
paragraph_texts = etree.HTML(str(html)).xpath("//article[@class = 'article']//p//text()")
# INSERT statement for the `information` table with eight %s placeholders,
# one per listed column.
sql_insert = (
    'insert into information '
    '(`type`, url,author,title,content,postTime,addtime,`unique`) '
    'values (%s,%s,%s,%s,%s,%s,%s,%s)'
)
# Kept so the pre-existing name `sql` still resolves. It previously held the
# text of the assignment itself rather than the statement.
sql = sql_insert

# Open the MySQL connection (host and credentials are placeholders).
conn = pymysql.connect(
    host="xxxxxxx",
    port=3306,
    user="xxxx",
    passwd="xxxxxx",
    db="news",
    charset="utf8",
)
cursor = conn.cursor()
# NOTE(review): the statement has eight %s placeholders but no values are
# supplied here. Pass the row as the second argument, e.g.
# cursor.execute(sql_insert, row_tuple), once the row values are available.
cursor.execute(sql_insert)
# PyMySQL does not autocommit by default, so the insert must be committed
# explicitly or it is discarded when the connection closes.
conn.commit()
完整代码请见我的 CSDN 链接！