@@ -958,25 +958,8 @@ def down_file_by_url(self, share_url, pwd='', save_path='./Download', *, callbac
958958 if not resp :
959959 return LanZouCloud .FAILED
960960
961- content_length = resp .headers .get ('Content-Length' , None )
962- # 如果无法获取 Content-Length, 先读取一点数据, 再尝试获取一次
963- # 通常只需读取 1 字节数据
964- data_iter = resp .iter_content (chunk_size = 1 )
965- while not content_length :
966- logger .warning ("Not found Content-Length in response headers" )
967- logger .debug ("Read 1 byte from stream..." )
968- try :
969- next (data_iter )
970- except StopIteration :
971- logger .debug ("Please wait for a moment before downloading" )
972- return LanZouCloud .FAILED
973- resp_ = self ._get (info .durl , stream = True )
974- if not resp_ :
975- return LanZouCloud .FAILED
976- content_length = resp_ .headers .get ('Content-Length' , None )
977- logger .debug (f"Content-Length: { content_length } " )
978-
979- total_size = int (content_length )
961+ # 如果本地存在同名文件且设置了 overwrite, 则覆盖原文件
962+ # 否则修改下载文件路径, 自动在文件名后加序号
980963 file_path = save_path + os .sep + info .name
981964 if os .path .exists (file_path ):
982965 if overwrite :
@@ -989,9 +972,33 @@ def down_file_by_url(self, share_url, pwd='', save_path='./Download', *, callbac
989972 tmp_file_path = file_path + '.download' # 正在下载中的文件名
990973 logger .debug (f'Save file to { tmp_file_path } ' )
991974
975+ # 对于 txt 文件, 可能出现没有 Content-Length 的情况
976+ # 此时文件需要下载一次才会出现 Content-Length
977+ # 这时候我们先读取一点数据, 再尝试获取一次, 通常只需读取 1 字节数据
978+ content_length = resp .headers .get ('Content-Length' , None )
979+ if not content_length :
980+ data_iter = resp .iter_content (chunk_size = 1 )
981+ max_retries = 5 # 5 次拿不到就算了
982+ while not content_length and max_retries > 0 :
983+ max_retries -= 1
984+ logger .warning ("Not found Content-Length in response headers" )
985+ logger .debug ("Read 1 byte from stream..." )
986+ try :
987+ next (data_iter ) # 读取一个字节
988+ except StopIteration :
989+ logger .debug ("Please wait for a moment before downloading" )
990+ return LanZouCloud .FAILED
991+ resp_ = self ._get (info .durl , stream = True ) # 再请求一次试试
992+ if not resp_ :
993+ return LanZouCloud .FAILED
994+ content_length = resp_ .headers .get ('Content-Length' , None )
995+ logger .debug (f"Content-Length: { content_length } " )
996+
997+ if not content_length :
998+ return LanZouCloud .FAILED # 应该不会出现这种情况
999+
1000+ # 支持断点续传下载
9921001 now_size = 0
993- chunk_size = 4096
994- last_512_bytes = b'' # 用于识别文件是否携带真实文件名信息
9951002 if os .path .exists (tmp_file_path ):
9961003 now_size = os .path .getsize (tmp_file_path ) # 本地已经下载的文件大小
9971004 headers = {** self ._headers , 'Range' : 'bytes=%d-' % now_size }
@@ -1004,30 +1011,43 @@ def down_file_by_url(self, share_url, pwd='', save_path='./Download', *, callbac
10041011
10051012 with open (tmp_file_path , "ab" ) as f :
10061013 file_name = os .path .basename (file_path )
1007- for chunk in resp .iter_content (chunk_size ):
1014+ for chunk in resp .iter_content (4096 ):
10081015 if chunk :
10091016 f .write (chunk )
10101017 f .flush ()
10111018 now_size += len (chunk )
1012- if total_size - now_size < 512 :
1013- last_512_bytes += chunk
10141019 if callback is not None :
1015- callback (file_name , total_size , now_size )
1020+ callback (file_name , int (content_length ), now_size )
1021+
1022+ # 文件下载完成后, 检查文件尾部 512 字节数据
1023+ # 绕过官方限制上传时, API 会隐藏文件真实信息到文件尾部
1024+ # 这里尝试提取隐藏信息, 并截断文件尾部数据
10161025 os .rename (tmp_file_path , file_path ) # 下载完成,改回正常文件名
1017- # 尝试解析文件报尾
1018- file_info = un_serialize (last_512_bytes [- 512 :])
1019- if file_info is not None and 'padding' in file_info : # 大文件的记录文件也可以反序列化出 name,但是没有 padding
1020- real_name = file_info ['name' ] # 解除伪装的真实文件名
1021- logger .debug (f"Find meta info: real_name={ real_name } " )
1022- real_path = save_path + os .sep + real_name
1023- if overwrite and os .path .exists (real_path ):
1024- os .remove (real_path ) # 删除原文件
1025- new_file_path = auto_rename (real_path )
1026- os .rename (file_path , new_file_path )
1027- with open (new_file_path , 'rb+' ) as f :
1028- f .seek (- 512 , 2 ) # 截断最后 512 字节数据
1029- f .truncate ()
1030- file_path = new_file_path # 保存文件重命名后真实路径
1026+ if os .path .getsize (file_path ) > 512 : # 文件大于 512 bytes 就检查一下
1027+ file_info = None
1028+ with open (file_path , 'rb' ) as f :
1029+ f .seek (- 512 , os .SEEK_END )
1030+ last_512_bytes = f .read ()
1031+ file_info = un_serialize (last_512_bytes )
1032+
1033+ # 大文件的记录文件也可以反序列化出 name,但是没有 padding 字段
1034+ if file_info is not None and 'padding' in file_info :
1035+ real_name = file_info ['name' ] # 解除伪装的真实文件名
1036+ logger .debug (f"Find meta info: real_name={ real_name } " )
1037+ real_path = save_path + os .sep + real_name
1038+ # 如果存在同名文件且设置了 overwrite, 删掉原文件
1039+ if overwrite and os .path .exists (real_path ):
1040+ os .remove (real_path )
1041+ # 自动重命名, 文件存在就会加个序号
1042+ new_file_path = auto_rename (real_path )
1043+ os .rename (file_path , new_file_path )
1044+ # 截断最后 512 字节隐藏信息, 还原文件
1045+ with open (new_file_path , 'rb+' ) as f :
1046+ f .seek (- 512 , os .SEEK_END )
1047+ f .truncate ()
1048+ file_path = new_file_path # 保存文件重命名后真实路径
1049+
1050+ # 如果设置了下载完成的回调函数, 调用之
10311051 if downloaded_handler is not None :
10321052 downloaded_handler (os .path .abspath (file_path ))
10331053 return LanZouCloud .SUCCESS
@@ -1054,6 +1074,15 @@ def get_folder_info_by_url(self, share_url, dir_pwd='') -> FolderDetail:
10541074 # 要求输入密码, 用户描述中可能带有"输入密码",所以不用这个字符串判断
10551075 if ('id="pwdload"' in html or 'id="passwddiv"' in html ) and len (dir_pwd ) == 0 :
10561076 return FolderDetail (LanZouCloud .LACK_PASSWORD )
1077+
1078+ if "acw_sc__v2" in html :
1079+ # 在页面被过多访问或其他情况下,有时候会先返回一个加密的页面,其执行计算出一个acw_sc__v2后放入页面后再重新访问页面才能获得正常页面
1080+ # 若该页面进行了js加密,则进行解密,计算acw_sc__v2,并加入cookie
1081+ acw_sc__v2 = calc_acw_sc__v2 (html )
1082+ self ._session .cookies .set ("acw_sc__v2" , acw_sc__v2 )
1083+ logger .debug (f"Set Cookie: acw_sc__v2={ acw_sc__v2 } " )
1084+ html = self ._get (share_url ).text # 文件分享页面(第一页)
1085+
10571086 try :
10581087 # 获取文件需要的参数
10591088 html = remove_notes (html )
@@ -1144,6 +1173,7 @@ def _check_big_file(self, file_list):
11441173 logger .debug (f"Big file checking: Failed" )
11451174 return None
11461175 resp = self ._get (info .durl )
1176+ # 这里无需知道 txt 文件的 Content-Length, 全部读取即可
11471177 info = un_serialize (resp .content ) if resp else None
11481178 if info is not None : # 确认是大文件
11491179 name , size , * _ , parts = info .values () # 真实文件名, 文件字节大小, (其它数据),分段数据文件名(有序)
0 commit comments