Python Web Scraping
# Scrape a specific element from a static web page
import requests
from bs4 import BeautifulSoup

headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
}
resp = requests.get("http://localhost:8000/page.html", headers=headers)
html = resp.content.decode()
soup = BeautifulSoup(html, 'html.parser')
# Find every <div class="zz"> and print the text of its first <p> child
divs = soup.find_all('div', {'class': 'zz'})
for div in divs:
    if div.p is not None:  # guard against divs that contain no <p>
        print(div.p.text)
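
The same lookup can also be written with a CSS selector via `soup.select()`; a minimal sketch, assuming the same `page.html` markup with `<p>` tags inside `div.zz`:

# Equivalent lookup with a CSS selector: every <p> directly inside <div class="zz">
for p in soup.select('div.zz > p'):
    print(p.get_text(strip=True))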
# Scrape video titles from a video site with Python (the page is JavaScript-rendered, so Selenium is used)
from bs4 import BeautifulSoup
from selenium import webdriver

# Initialize the WebDriver (assumes Chrome with a matching chromedriver on PATH)
driver = webdriver.Chrome()
# Open the page and grab the rendered HTML
driver.get("https://www.dmla7.com/type/guochandongman.html")
html = driver.page_source
driver.quit()

soup = BeautifulSoup(html, 'html.parser')
li_tags = soup.find_all('li', {'class': 'col-md-6 col-sm-4 col-xs-3'})
titles = []
for li_tag in li_tags:
    # Each title lives in the 'title' attribute of the thumbnail link
    a_tag = li_tag.find('a', class_='stui-vodlist__thumb lazyload')
    if a_tag and 'title' in a_tag.attrs:
        titles.append(a_tag['title'])

# Print the results
for title in titles:
    print(title)
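
Opening a visible browser window is unnecessary on a server. A hedged variant using headless Chrome plus an explicit wait, so `page_source` is read only after the list has rendered (the `a.stui-vodlist__thumb` selector is an assumption based on the class names above):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = Options()
options.add_argument("--headless=new")  # run Chrome without a visible window
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://www.dmla7.com/type/guochandongman.html")
    # Wait up to 10 seconds for the thumbnail links to appear
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "a.stui-vodlist__thumb"))
    )
    html = driver.page_source
finally:
    driver.quit()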
# Enter a query into a search engine with Python and fetch the top ten result titles
import requests
from bs4 import BeautifulSoup

# Spoof the request headers to mimic a browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
# Baidu search URL
url = "https://www.baidu.com/s"
params = {"wd": "python基础知识"}
# Send the request
response = requests.get(url, headers=headers, params=params)
# Check whether the request succeeded
if response.status_code == 200:
    # Parse the HTML
    soup = BeautifulSoup(response.text, "html.parser")
    # Result titles are rendered as <h3> tags; take the first 10
    titles = soup.find_all("h3")[:10]
    for i, title in enumerate(titles, 1):
        print(f"{i}. {title.get_text()}")
else:
    print("Request failed, status code:", response.status_code)
# Scrape songs from the NetEase Cloud Music original chart
import requests
from bs4 import BeautifulSoup
import os
import time
import re

def get_song_list(url, limit=3):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Referer": "https://music.163.com/"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    song_list = []
    # The chart page embeds the track list in <ul class="f-hide">;
    # each <a> href ends with the song id
    for song in soup.select('ul.f-hide li a')[:limit]:
        song_id = song['href'].split('=')[-1]
        song_name = song.text
        song_list.append((song_id, song_name))
    return song_list

def get_song_url(song_id):
    # The "outer" media endpoint redirects to the actual mp3 stream
    url = f"http://music.163.com/song/media/outer/url?id={song_id}.mp3"
    return url

def download_song(song_url, song_name):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    }
    response = requests.get(song_url, headers=headers, stream=True)
    if response.status_code == 200:
        # Create the downloads folder if it does not exist
        os.makedirs("downloads", exist_ok=True)
        # Strip characters that are illegal in file names
        song_name = re.sub(r'[\\/*?:"<>|]', "", song_name)
        # Save the file
        file_path = os.path.join("downloads", f"{song_name}.mp3")
        print(file_path)
        # with open(file_path, "wb") as f:
        #     for chunk in response.iter_content(chunk_size=1024):
        #         if chunk:
        #             f.write(chunk)
        # print(f"Download finished: {song_name}")
    else:
        print(f"Download failed: {song_name}")

def main():
    url = "https://music.163.com/discover/toplist?id=2884035"
    song_list = get_song_list(url, limit=3)
    for song_id, song_name in song_list:
        song_url = get_song_url(song_id)
        print(f"Downloading: {song_name}")
        download_song(song_url, song_name)
        time.sleep(1)  # pause between requests to avoid an IP ban

if __name__ == "__main__":
    main()
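
The outer media endpoint answers with a redirect, and for unavailable tracks the final response may not be audio at all. A hedged pre-check before writing anything to disk (`looks_like_audio` is a hypothetical helper, not part of any NetEase API):

def looks_like_audio(response):
    # Hypothetical helper: guard against redirects that land on an HTML
    # error page instead of an mp3 stream
    content_type = response.headers.get("Content-Type", "")
    return "audio" in content_type or "octet-stream" in content_type

# Usage inside download_song, before saving:
# if response.status_code == 200 and looks_like_audio(response):
#     ... write the file ...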