import requests

# Configure the proxy server (replace user, password, proxy_ip and proxy_port with your own values)
proxies = {
    'http': 'http://user:password@proxy_ip:proxy_port',
    'https': 'https://user:password@proxy_ip:proxy_port'
}

# Send the request to the blocked website through the proxy using requests
url = "https://www.blockedwebsite.com"
response = requests.get(url, proxies=proxies)

# Print the result
print(response.text)
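If the proxy settings above are correct, the same proxies dict can be reused for a quick sanity check before scraping. The sketch below is only an illustration and is not part of the original snippet; it assumes the same placeholder proxy values and uses the public httpbin.org/ip endpoint, which simply echoes the caller's IP address.

import requests

# Hypothetical sanity check: ask an IP-echo service which address it sees.
# Assumes the same proxies dict as above; httpbin.org/ip returns JSON like {"origin": "1.2.3.4"}.
proxies = {
    'http': 'http://user:password@proxy_ip:proxy_port',
    'https': 'https://user:password@proxy_ip:proxy_port'
}
try:
    check = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=10)
    print("Requests are leaving from:", check.json().get("origin"))
except requests.RequestException as exc:
    print("Proxy check failed:", exc)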
import threading
import requests

# Download a page and save it to a local file
def download(url, filename):
    response = requests.get(url)
    with open(filename, 'wb') as f:
        f.write(response.content)

# List of pages to fetch
urls = ['https://www.example.com/page1', 'https://www.example.com/page2', 'https://www.example.com/page3']

# Fetch multiple pages at the same time, one thread per URL
threads = []
for i, url in enumerate(urls):
    thread = threading.Thread(target=download, args=(url, f'page{i+1}.html'))
    threads.append(thread)
    thread.start()

# Wait for all threads to finish
for thread in threads:
    thread.join()
print('All pages downloaded!')
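A pool-based variant of the same idea is possible with the standard-library concurrent.futures module, which manages the threads for you. This is a minimal sketch, assuming the download function and urls list defined above; the choice of three workers is arbitrary.

from concurrent.futures import ThreadPoolExecutor

# Let a thread pool schedule the downloads (assumes download() and urls from the example above)
with ThreadPoolExecutor(max_workers=3) as executor:
    for i, url in enumerate(urls):
        executor.submit(download, url, f'page{i+1}.html')
# Leaving the with-block waits for all submitted downloads to finish
print('All pages downloaded!')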
from selenium import webdriver
from selenium.webdriver.common.by import By

# Create a WebDriver object using the Firefox browser
driver = webdriver.Firefox()

# Open the site and log in
driver.get("https://www.example.com/login")
driver.find_element(By.ID, "username").send_keys("your_username")
driver.find_element(By.ID, "password").send_keys("your_password")
driver.find_element(By.ID, "login-button").click()

# Navigate to the target page and extract the data
driver.get("https://www.example.com/target-page")
data = driver.find_element(By.XPATH, "//div[@class='data']").text

# Close the browser
driver.quit()

# Print the result
print(data)
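The login flow above assumes every element is already present as soon as the page loads. When that is not guaranteed, Selenium's explicit waits are the usual remedy; the following is a minimal sketch under the assumption that the page uses the same element IDs as the example.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get("https://www.example.com/login")

# Wait up to 10 seconds for the (assumed) username field to appear before typing
username_field = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "username"))
)
username_field.send_keys("your_username")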
import scrapy
from myproject.items import MyItem

class MySpider(scrapy.Spider):
    name = "myspider"

    def start_requests(self):
        urls = [
            'https://www.example.com/page1',
            'https://www.example.com/page2',
            'https://www.example.com/page3',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Extract the page title and body text into an item
        item = MyItem()
        item['title'] = response.xpath('//h1/text()').get()
        item['body'] = response.xpath('//div[@class="body"]/text()').get()
        yield item
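The spider imports MyItem from myproject.items, which is not shown here. Under the usual Scrapy project layout it would be a small Item subclass along these lines; this is a guess at the original definition, matching the title and body fields used in parse.

# myproject/items.py (assumed contents)
import scrapy

class MyItem(scrapy.Item):
    title = scrapy.Field()
    body = scrapy.Field()

With the project in place, the spider can be run from the project root with the scrapy crawl myspider command.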
Any other technique, divorced from practice, is just empty talk!