给定一个 HTML 文件,找出其中的所有链接。
import requests, re
from bs4 import BeautifulSoup
def get_html_text(url, timeout=10):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Without it a stalled server would block
            the call indefinitely.

    Returns:
        The response text, or an empty string when the request fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best-effort fetch: a failed request is reported as an empty page
        # instead of propagating the network error.
        return ''
    # Force UTF-8 decoding regardless of the charset the server declares.
    response.encoding = 'utf-8'
    return response.text
# Absolute URL with an optional scheme,
# e.g. http://www.baidu.com/path/index.php?q=1
_ABSOLUTE_URL_PATTERN = re.compile(
    r'((http|https|ftp)://)?(\w+)(\.\w+)+', re.IGNORECASE
)
# Relative path, e.g. ./index.php
_RELATIVE_URL_PATTERN = re.compile(r'\.?(/\w+)+/?')


def get_urls(html, base_url):
    """Collect the link targets of every ``<a>`` tag in *html*.

    An ``href`` that looks like an absolute URL is kept unchanged. An
    ``href`` that looks like a relative path (``/a/b`` or ``./a/b``) is
    resolved against *base_url*. Anything else (fragments, ``mailto:``,
    ``javascript:`` and so on) is ignored.

    Note that the scheme is optional in the absolute pattern, so a bare
    ``index.php`` also counts as absolute and is returned as-is.

    Args:
        html: HTML markup to scan.
        base_url: URL used to resolve relative links.

    Returns:
        A set of URL strings.
    """
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, 'html.parser')
    urls = set()
    for anchor in soup.find_all('a'):
        # An <a> tag may have no href attribute at all.
        href = anchor.get('href')
        if not href:
            continue
        if _ABSOLUTE_URL_PATTERN.match(href):
            urls.add(href)
        elif _RELATIVE_URL_PATTERN.match(href):
            # urljoin handles "./" segments and a trailing slash on the
            # base, which plain string concatenation got wrong.
            urls.add(urljoin(base_url, href))
    return urls
def show(urls):
    """Print each item of *urls* on its own line.

    Args:
        urls: Iterable of URLs to display. Nothing is printed when it is
            empty.
    """
    for link in urls:
        print(link)
if __name__ == '__main__':
    # Script entry point: fetch one hard-coded page and print every link
    # extracted from it. The page URL doubles as the base for resolving
    # relative links.
    url = 'http://zzzsdust.com'
    html = get_html_text(url)
    urls = get_urls(html, url)
    show(urls)