Python Web Scraping
# Scrape a specific element from a static web page
import requests
from bs4 import BeautifulSoup

headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
}
resp = requests.get("http://localhost:8000/page.html", headers=headers)
html = resp.content.decode()
soup = BeautifulSoup(html, 'html.parser')
# Find every <div class="zz"> and print the text of its first <p> child
divs = soup.find_all('div', {'class': 'zz'})
for div in divs:
    if div.p is not None:  # guard against divs that contain no <p>
        print(div.p.text)
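
The same lookup can also be written with a CSS selector via `soup.select()`; a minimal sketch, assuming the same `page.html` markup with `<p>` tags inside `div.zz`:

# Equivalent lookup with a CSS selector: every <p> directly inside <div class="zz">
for p in soup.select('div.zz > p'):
    print(p.get_text(strip=True))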
# Scrape video titles from a video site with Python (the page is JavaScript-rendered, so Selenium is used)
from bs4 import BeautifulSoup
from selenium import webdriver

# Initialize the WebDriver (assumes Chrome with a matching chromedriver on PATH)
driver = webdriver.Chrome()
# Open the page and grab the rendered HTML
driver.get("https://www.dmla7.com/type/guochandongman.html")
html = driver.page_source
driver.quit()

soup = BeautifulSoup(html, 'html.parser')
li_tags = soup.find_all('li', {'class': 'col-md-6 col-sm-4 col-xs-3'})
titles = []
for li_tag in li_tags:
    # Each title lives in the 'title' attribute of the thumbnail link
    a_tag = li_tag.find('a', class_='stui-vodlist__thumb lazyload')
    if a_tag and 'title' in a_tag.attrs:
        titles.append(a_tag['title'])

# Print the results
for title in titles:
    print(title)
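
Opening a visible browser window is unnecessary on a server. A hedged variant using headless Chrome plus an explicit wait, so `page_source` is read only after the list has rendered (the `a.stui-vodlist__thumb` selector is an assumption based on the class names above):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = Options()
options.add_argument("--headless=new")  # run Chrome without a visible window
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://www.dmla7.com/type/guochandongman.html")
    # Wait up to 10 seconds for the thumbnail links to appear
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "a.stui-vodlist__thumb"))
    )
    html = driver.page_source
finally:
    driver.quit()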
# Enter a query into a search engine with Python and fetch the top ten result titles
import requests
from bs4 import BeautifulSoup

# Spoof the request headers to mimic a browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
# Baidu search URL
url = "https://www.baidu.com/s"
params = {"wd": "python基础知识"}
# Send the request
response = requests.get(url, headers=headers, params=params)
# Check whether the request succeeded
if response.status_code == 200:
    # Parse the HTML
    soup = BeautifulSoup(response.text, "html.parser")
    # Result titles are rendered as <h3> tags; take the first 10
    titles = soup.find_all("h3")[:10]
    for i, title in enumerate(titles, 1):
        print(f"{i}. {title.get_text()}")
else:
    print("Request failed, status code:", response.status_code)
# Scrape songs from the NetEase Cloud Music original chart
import requests
from bs4 import BeautifulSoup
import os
import time
import re

def get_song_list(url, limit=3):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Referer": "https://music.163.com/"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    song_list = []
    # The chart page embeds the track list in <ul class="f-hide">;
    # each <a> href ends with the song id
    for song in soup.select('ul.f-hide li a')[:limit]:
        song_id = song['href'].split('=')[-1]
        song_name = song.text
        song_list.append((song_id, song_name))
    return song_list

def get_song_url(song_id):
    # The "outer" media endpoint redirects to the actual mp3 stream
    url = f"http://music.163.com/song/media/outer/url?id={song_id}.mp3"
    return url

def download_song(song_url, song_name):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    }
    response = requests.get(song_url, headers=headers, stream=True)
    if response.status_code == 200:
        # Create the downloads folder if it does not exist
        os.makedirs("downloads", exist_ok=True)
        # Strip characters that are illegal in file names
        song_name = re.sub(r'[\\/*?:"<>|]', "", song_name)
        # Save the file
        file_path = os.path.join("downloads", f"{song_name}.mp3")
        print(file_path)
        # with open(file_path, "wb") as f:
        #     for chunk in response.iter_content(chunk_size=1024):
        #         if chunk:
        #             f.write(chunk)
        # print(f"Download finished: {song_name}")
    else:
        print(f"Download failed: {song_name}")

def main():
    url = "https://music.163.com/discover/toplist?id=2884035"
    song_list = get_song_list(url, limit=3)
    for song_id, song_name in song_list:
        song_url = get_song_url(song_id)
        print(f"Downloading: {song_name}")
        download_song(song_url, song_name)
        time.sleep(1)  # pause between requests to avoid an IP ban

if __name__ == "__main__":
    main()
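
The outer media endpoint answers with a redirect, and for unavailable tracks the final response may not be audio at all. A hedged pre-check before writing anything to disk (`looks_like_audio` is a hypothetical helper, not part of any NetEase API):

def looks_like_audio(response):
    # Hypothetical helper: guard against redirects that land on an HTML
    # error page instead of an mp3 stream
    content_type = response.headers.get("Content-Type", "")
    return "audio" in content_type or "octet-stream" in content_type

# Usage inside download_song, before saving:
# if response.status_code == 200 and looks_like_audio(response):
#     ... write the file ...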