下载nginx搭建的文件服务器（爬虫）

Windows 版
需要安装 Python 包：pip install requests

import html
import os
import re
from urllib.parse import unquote

import requests

# Root URL where the crawl starts; must end with "/" because child names are
# appended to it by plain string concatenation.
index_url = "https://www.aaa.com/aaaaa/"
# Local directory the files are downloaded into; must end with "/" for the
# same reason.
local_address = "D:/up/"



# Captures the href value of every <a> tag (group 2); group 1 is the quote
# character so that the closing quote matches the opening one.
_HREF_PATTERN = re.compile(
    r'<a\s+[^>]*?\bhref\s*=\s*(["\'])(.*?)\1',
    re.IGNORECASE | re.DOTALL,
)


def getHtml(index_url, local_address):
    """Recursively mirror a directory-listing page to the local disk.

    Fetches the listing at ``index_url``, then for every link on the page
    either recurses (links ending in "/") or downloads the file through
    ``downFile``.

    Links are taken from the ``href`` attribute rather than from the visible
    link text: nginx autoindex HTML-escapes the text and truncates long names
    (ending them with "..>"), so the text is not a reliable file name.

    Args:
        index_url: URL of the listing page; must end with "/".
        local_address: Local directory that mirrors ``index_url``; must end
            with "/".

    Returns:
        None. Failures to fetch a listing are printed and that subtree is
        skipped.
    """
    try:
        # A timeout keeps one stalled listing from hanging the whole crawl.
        resp = requests.get(index_url, timeout=30)
    except requests.RequestException as exc:
        print("下载出错:" + index_url + " (" + str(exc) + ")")
        return
    if resp.status_code != 200:
        # Do not parse an error page as if it were a directory listing.
        print("下载出错:" + index_url)
        return

    for _quote, raw_href in _HREF_PATTERN.findall(resp.text):
        # Attribute values are HTML-escaped in the page (e.g. "&amp;").
        href = html.unescape(raw_href).strip()
        if not href:
            continue
        # Skip the parent link, absolute paths, sort/query links, fragments
        # and links to other hosts: only children of this listing are wanted.
        if href.startswith(("../", "/", "?", "#")) or "://" in href:
            continue

        # The href is percent-encoded; decode it for the local file name.
        name = unquote(href)
        parts = name.replace("\\", "/").split("/")
        if any(part in (".", "..") for part in parts):
            # Refuse path segments that would loop or escape local_address.
            continue

        if href.endswith("/"):
            # Sub-directory: recurse into it.
            getHtml(index_url + href, local_address + name)
        else:
            hq_index_url = index_url + href
            hq_local_address = local_address + name
            print(hq_index_url+"=========="+hq_local_address)
            downFile(hq_index_url, hq_local_address)


def downFile(url, local_address):
    """Download ``url`` and store it at ``local_address``.

    The parent directory of ``local_address`` is created when missing. The
    body is streamed in 8 KiB chunks so large files are not held in memory.

    Args:
        url: URL of the file to download.
        local_address: Destination file path on the local disk.

    Returns:
        None. A non-200 response or a network error is printed and the
        function returns without raising.
    """
    directory = os.path.dirname(local_address)
    # An empty directory means "current directory"; os.makedirs("") would raise.
    if directory:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(directory, exist_ok=True)

    try:
        # The with-block releases the connection even if writing fails; the
        # timeout keeps a stalled server from hanging the crawl forever.
        with requests.get(url, stream=True, timeout=30) as response:
            if response.status_code != 200:
                print("下载出错:"+url)
                return
            with open(local_address, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    # Skip empty keep-alive chunks.
                    if chunk:
                        f.write(chunk)
    except requests.RequestException as exc:
        # Report and continue so one failed file does not abort the crawl.
        print("下载出错:" + url + " (" + str(exc) + ")")


# Script entry point: start the recursive download from the configured root
# URL into the configured local directory.
getHtml(index_url,local_address)