淘淘商城学习资源目录

淘淘商城学习资源目录

目录

爬取代码

from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import cn2an
# Source of randomised browser User-Agent strings.
ua = UserAgent()
# Paginated listing pages of the CSDN column to scrape.
urls = [
    "https://blog.csdn.net/u012453843/category_6970308.html",
    "https://blog.csdn.net/u012453843/category_6970308_2.html",
    "https://blog.csdn.net/u012453843/category_6970308_3.html",
]
# One random User-Agent is picked here and reused for every request.
headers = {"User-Agent": ua.random}
# Collected articles: one {"title": ..., "href": ...} dict per list entry.
data = []
# Fetch and parse every listing page.
for url in urls:
    # requests applies no timeout by default, so a stalled connection would
    # hang the script forever; bound the wait explicitly.
    r = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly on HTTP errors instead of parsing an error page.
    r.raise_for_status()
    bs = BeautifulSoup(r.text, "html.parser")
    article_lists = bs.find_all("ul", class_="column_article_list")
    if not article_lists:
        # Name the offending page rather than dying with a bare IndexError.
        raise RuntimeError("no column_article_list found in %s" % url)
    for i in article_lists[0].find_all("li"):
        link = i.a
        heading = link.find_all("div", class_="column_article_title")[0].h2
        # Normalise the heading text: drop spaces, the "原创" label line,
        # remaining newlines and the word "学习".
        title = (
            heading.text.replace(" ", "")
            .replace("原创\n", "")
            .replace("\n", "")
            .replace("学习", "")
        )
        data.append({"title": title, "href": link["href"]})
# Derive a numeric sort key ("index") from the lesson number in each title.
# The number is the Chinese numeral between "第" and "课(".
# Titles without a usable lesson number get 999.
for i in data:
    title = i["title"]
    start_index = title.find("第")
    # Search for the closing marker only after "第". A missing marker (-1)
    # or one that precedes "第" must not be used as a slice bound.
    end_index = title.find("课(", start_index + 1) if start_index != -1 else -1
    if end_index == -1:
        i["index"] = 999
        continue
    try:
        i["index"] = cn2an.cn2an(title[start_index + 1:end_index], "normal")
    except ValueError:
        # cn2an rejects text it cannot read as a number (including an empty
        # string); treat such titles as unnumbered instead of crashing.
        i["index"] = 999
# Order the articles by ascending lesson index; list.sort is stable, so
# entries sharing an index keep their scrape order.
data.sort(key=lambda entry: entry["index"])
# Emit one Markdown bullet link per article.
for entry in data:
    link_text, link_target = entry["title"], entry["href"]
    print("* [%s](%s)" % (link_text, link_target))

参考资料