forked from aploium/zmirror
-
Notifications
You must be signed in to change notification settings - Fork 0
/
zmirror.py
2232 lines (1882 loc) · 94.8 KB
/
zmirror.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
# coding=utf-8
import os
# noinspection PyUnresolvedReferences
from itertools import count

# Make every relative path (config files, cache folder, logs) resolve against
# the directory containing this script, no matter where it was launched from.
if os.path.dirname(__file__) != '':
    os.chdir(os.path.dirname(__file__))
import traceback
import pickle
from datetime import datetime, timedelta
import re
import base64
import zlib
import random
import sched
import copy
from time import time, sleep
import queue
from fnmatch import fnmatch
from html import escape as html_escape
from urllib.parse import urljoin, urlsplit, urlunsplit, quote_plus
import requests
from flask import Flask, request, make_response, Response, redirect
from ColorfulPyPrint import *  # TODO: Migrate logging tools to the stdlib

__VERSION__ = '0.23.0-dev'
__author__ = 'Aploium <i@z.codes>'

infoprint('zmirror version: ', __VERSION__, 'from', __author__)
infoprint('Github: https://github.com/Aploium/zmirror')

try:
    import threading
except ImportError:  # on some rare systems the threading package is broken; fall back to the dummy implementation
    import dummy_threading as threading

try:  # used to detect the text encoding of html; cchardet is a C implementation of chardet, much faster
    from cchardet import detect as c_chardet
except:
    cchardet_available = False
else:
    cchardet_available = True

try:  # C implementation of lru_cache, faster than the stdlib's (caches function results)
    from fastcache import lru_cache
except:
    from functools import lru_cache

    warnprint('package fastcache not found, fallback to stdlib lru_cache, no FUNCTION is effected, only maybe a bit slower. '
              'Considering install it using "pip3 install fastcache"')
else:
    infoprint('lru_cache loaded successfully from fastcache')

try:  # load the default settings
    from config_default import *
except:
    warnprint('the config_default.py is missing, this program may not works normally\n'
              'config_default.py 文件丢失, 这会导致配置文件不向后兼容, 请重新下载一份 config_default.py')

try:  # load the user's custom config, overriding same-name defaults
    from config import *
except:
    warnprint(
        'the config_default.py is missing, fallback to default configs(if we can), '
        'please COPY the config_default.py to config.py, and change it\'s content, '
        'or use the configs in the more_configs folder\n'
        '自定义配置文件 config.py 丢失, 将使用默认设置, 请将 config_default.py 复制一份为 config.py, '
        '并根据自己的需求修改里面的设置'
        '(或者使用 more_configs 中的配置文件)'
    )
else:
    infoprint('config file found')

if local_cache_enable:
    try:
        from cache_system import FileCache, get_expire_from_mime

        cache = FileCache()
    except Exception as e:
        errprint('Can Not Create Local File Cache: ', e, ' local file cache is disabled automatically.')
        local_cache_enable = False
    else:
        infoprint('Local file cache enabled')
# ########## Basic Init #############
# Start loading settings from the config; when reading the code you can skip
# this part and start from main_function()
ColorfulPyPrint_set_verbose_level(verbose_level)
my_host_name_no_port = my_host_name  # our own hostname, without the port
if my_host_port is not None:
    my_host_name += ':' + str(my_host_port)  # hostname with explicit port (omitted for standard ports)
    my_host_name_urlencoded = quote_plus(my_host_name)  # urlencoded form of the hostname
else:
    my_host_name_urlencoded = my_host_name
static_file_extensions_list = set(static_file_extensions_list)
external_domains_set = set(external_domains or [])
allowed_domains_set = external_domains_set.copy()
allowed_domains_set.add(target_domain)
for _domain in external_domains:  # for support domain with port
    allowed_domains_set.add(urlsplit('http://' + _domain).hostname)
# domains treated as aliases of the main domain, eg: www.google.com and google.com
domain_alias_to_target_set = set()
domain_alias_to_target_set.add(target_domain)
domains_alias_to_target_domain = list(domains_alias_to_target_domain)
if domains_alias_to_target_domain:
    for _domain in domains_alias_to_target_domain:
        allowed_domains_set.add(_domain)
        domain_alias_to_target_set.add(_domain)
    domains_alias_to_target_domain.append(target_domain)
else:
    domains_alias_to_target_domain = [target_domain]
my_host_scheme_escaped = my_host_scheme.replace('/', r'\/')
myurl_prefix = my_host_scheme + my_host_name  # eg: http(s)://www.my-mirror-site.com (no trailing slash)
myurl_prefix_escaped = myurl_prefix.replace('/', r'\/')
cdn_domains_number = len(CDN_domains)
# remote response headers that may be passed back to the client
allowed_remote_response_headers = {
    'content-type', 'date', 'expires', 'cache-control', 'last-modified', 'server', 'location',
    'accept-ranges',
    'access-control-allow-origin', 'access-control-allow-headers', 'access-control-allow-methods',
    'access-control-expose-headers', 'access-control-max-age', 'access-control-allow-credentials',
    'timing-allow-origin',
}
allowed_remote_response_headers.update(custom_allowed_remote_headers)
# ## Get Target Domain and MyHostName's Root Domain ##
# Derive the root domain of the target domain and of our own hostname,
# eg: the root of www.foobar.com is foobar.com,
# but the root of www.aaa.foobar.com is considered to be aaa.foobar.com.
# Two-part TLDs are supported, eg: www.white.ac.cn
temp = target_domain.split('.')
if len(temp) <= 2 or len(temp) == 3 and temp[1] in ('com', 'net', 'org', 'co', 'edu', 'mil', 'gov', 'ac'):
    target_domain_root = target_domain
else:
    target_domain_root = '.'.join(temp[1:])
# Derive the root domain of our own hostname, using the same rule as
# target_domain_root above.
temp = my_host_name.split('.')
if len(temp) <= 2 or len(temp) == 3 and temp[1] in ('com', 'net', 'org', 'co', 'edu', 'mil', 'gov', 'ac'):
    # bugfix: this branch previously assigned `target_domain` (the REMOTE
    # site's domain); my_host_name_root must be derived from our OWN hostname,
    # exactly mirroring the target_domain_root computation above
    my_host_name_root = my_host_name
else:
    my_host_name_root = '.'.join(temp[1:])
# keep-alive connection pool: keep one keep-alive connection per domain,
# piggybacking on requests' automatic keep-alive within a single Session
connection_pool_per_domain = {}
if enable_keep_alive_per_domain:
    for _domain in allowed_domains_set:
        connection_pool_per_domain[_domain] = {'session': requests.Session(), }
# marker string used by `cdn_redirect_encode_query_str_into_url` to delimit
# the query string that gets encoded into the url
cdn_url_query_encode_salt = 'zm24'
_url_salt = re.escape(cdn_url_query_encode_salt)
# ## thread local var ##
# Similar in spirit to flask's `request` proxy: holds parsed per-request
# state, referenced all over the program.
this_request = threading.local()
this_request.start_time = None  # unix timestamp when handling of the request started
this_request.content_type = ''  # content_type from the remote server's response headers
this_request.mime = ''  # MIME of the remote server's response
this_request.cache_control = ''  # cache_control of the remote server's response
this_request.temporary_domain_alias = None  # for plain-text domain replacement, see the `plain_replace_domain_alias` option
this_request.remote_domain = ''  # remote domain corresponding to the current request
this_request.is_https = ''  # whether the remote domain must be requested over https
this_request.remote_url = ''  # the remote server's url
this_request.remote_path = ''  # the corresponding remote path
this_request.remote_path_query = ''  # the corresponding remote path + query string
this_request.remote_response = None  # the remote server's response, requests.Response
# task_scheduler
task_scheduler = sched.scheduler(time, sleep)
# ########## Handle dependencies #############
# Force-disable every option whose prerequisite option is turned off.
if not enable_static_resource_CDN:
    mime_based_static_resource_CDN = False
    disable_legacy_file_recognize_method = True
if not mime_based_static_resource_CDN:
    cdn_redirect_code_if_cannot_hard_rewrite = 0
# record incoming urls if we should use cdn on it
url_to_use_cdn = {}
if not cdn_redirect_code_if_cannot_hard_rewrite:
    cdn_redirect_encode_query_str_into_url = False
if not isinstance(target_static_domains, set):
    target_static_domains = set()
if not enable_stream_content_transfer:
    steamed_mime_keywords = ()
if not url_custom_redirect_enable:
    url_custom_redirect_list = {}
    url_custom_redirect_regex = ()
    shadow_url_redirect_regex = ()
    plain_replace_domain_alias = ()
if not enable_stream_content_transfer:
    enable_stream_transfer_async_preload = False
if not enable_automatic_domains_whitelist:
    domains_whitelist_auto_add_glob_list = tuple()
if not enable_individual_sites_isolation:
    isolated_domains = set()
else:
    for isolated_domain in isolated_domains:
        if isolated_domain not in external_domains_set:
            warnprint('An isolated domain:', isolated_domain,
                      'would not have effect because it did not appears in the `external_domains` list')
if enable_custom_access_cookie_generate_and_verify:
    human_ip_verification_whitelist_from_cookies = False
if not is_use_proxy:
    requests_proxies = None
if human_ip_verification_enabled:
    import ipaddress

    # pre-parse the whitelist networks once so per-request checks are cheap
    buff = []
    for network in human_ip_verification_default_whitelist_networks:
        buff.append(ipaddress.ip_network(network, strict=False))
    human_ip_verification_default_whitelist_networks = tuple(buff)
    # fold every answer into the salt used by generate_ip_verify_hash()
    for question in human_ip_verification_questions:
        human_ip_verification_answers_hash_str += question[1]
else:
    identity_verify_required = False
    human_ip_verification_whitelist_from_cookies = False
    must_verify_cookies = False
if not human_ip_verification_whitelist_from_cookies and not enable_custom_access_cookie_generate_and_verify:
    must_verify_cookies = False
url_rewrite_cache = {}  # an VERY Stupid and VERY Experimental Cache
url_rewrite_cache_hit_count = 0
url_rewrite_cache_miss_count = 0
# ########### PreCompile Regex ###############
# Advanced url rewriter, see function response_text_rewrite()
# #### This regex is the single most important part of the whole program:
#      it extracts url-looking substrings from html/css/js. ####
# If you need to read it, please do so with an IDE's regex highlighting (eg PyCharm).
# A match is NOT guaranteed to be an url; regex_url_reassemble() performs further checks.
regex_adv_url_rewriter = re.compile(  # TODO: Add non-standard port support
    # prefix, required: 'action='(form) 'href='(link) 'src=' 'url('(css) '@import'(css) '":'(js/json, "key":"value")
    # \s matches whitespace such as space and tab
    r"""(?P<prefix>\b((action|href|src)\s*=|url\s*\(|@import\s*|"\s*:)\s*)""" +  # prefix, eg: src=
    # left quote, optional (url() permits no quotes). For anything but url(),
    # quotes must exist and match; that is checked in the rewrite function
    # because encoding it in the regex would be unreadable.
    r"""(?P<quote_left>["'])?""" +  # quote "'
    # domain and scheme, optional: http:// https:// // http:\/\/ (json) https:\/\/ (json) \/\/ (json)
    r"""(?P<domain_and_scheme>(?P<scheme>(https?:)?\\?/\\?/)(?P<domain>([-a-z0-9]+\.)+[a-z]+(?P<port>:\d{1,5})?))?""" +
    # url path (including query string), optional
    r"""(?P<path>[^\s;+$?#'"\{}]*?""" +  # full path(with query string) /foo/bar.js?love=luciaZ
    # file extension inside the url; only active when the legacy
    # extension-based static-file recognition is enabled
    (r"""(\.(?P<ext>[-_a-z0-9]+?))?""" if not disable_legacy_file_recognize_method else '') +  # file ext
    # query string, optional
    r"""(?P<query_string>\?[^\s?#'"]*?)?)""" +  # query string ?love=luciaZ
    # right quote (may also be a closing parenthesis), required
    r"""(?P<quote_right>["'\)])(?P<right_suffix>\W)""",  # right quote "'
    flags=re.IGNORECASE
)
# extracts the base64 payload out of urls built by embed_real_url_to_embedded_url()
regex_extract_base64_from_embedded_url = re.compile(
    r'_' + _url_salt + r'(?P<gzip>z?)_\.(?P<b64>[a-zA-Z0-9-_]+=*)\._' + _url_salt + r'_\.[a-zA-Z\d]+\b')
# Response Cookies Rewriter, see response_cookie_rewrite()
regex_cookie_rewriter = re.compile(r'\bdomain=(\.?([\w-]+\.)+\w+)\b', flags=re.IGNORECASE)
regex_cookie_path_rewriter = re.compile(r'(?P<prefix>[pP]ath)=(?P<path>[\w\._/-]+?;)')
# Request Domains Rewriter, see client_requests_text_rewrite()
if my_host_port is not None:
    # accept the hostname both with and without the explicit port
    temp = r'(' + re.escape(my_host_name) + r'|' + re.escape(my_host_name_no_port) + r')'
else:
    temp = re.escape(my_host_name)
regex_request_rewriter = re.compile(
    temp + r'(/|(%2F))extdomains(/|(%2F))(https-)?(?P<origin_domain>\.?([\w-]+\.)+\w+)\b',
    flags=re.IGNORECASE)
# Flask main app
app = Flask(__name__)
# ########## Begin Utils #############
def cache_clean(is_force_flush=False):
    """Collect garbage accumulated while the program runs (rewrite caches,
    file cache, keep-alive pools).  Called periodically and automatically.
    By default only expired entries are purged.

    :param is_force_flush: ignore expiry and wipe every cache
    """
    global url_rewrite_cache, cache, url_to_use_cdn, connection_pool_per_domain
    # drop the experimental rewrite caches once they grow past their bounds
    if len(url_rewrite_cache) > 16384:
        url_rewrite_cache.clear()
    if len(url_to_use_cdn) > 40960:
        url_to_use_cdn.clear()
    if enable_keep_alive_per_domain:
        connection_pool_per_domain.clear()
    try:
        if local_cache_enable:
            cache.check_all_expire(force_flush_all=is_force_flush)
    except:
        errprint('ErrorWhenCleaningLocalCache, is_force_flush=', is_force_flush)
        traceback.print_exc()
    if not is_force_flush:
        return
    # a forced flush also empties every function-level lru_cache
    try:
        for cached_function in (
                is_domain_match_glob_whitelist,
                is_content_type_streamed,
                extract_real_url_from_embedded_url,
                embed_real_url_to_embedded_url,
                check_global_ua_pass,
                is_mime_represents_text,
                extract_mime_from_content_type,
                is_content_type_using_cdn,
                is_ua_in_whitelist,
                verify_ip_hash_cookie,
                is_denied_because_of_spider,
                is_ip_not_in_allow_range,
                # client_requests_text_rewrite / extract_url_path_and_query
                # are intentionally NOT flushed here
        ):
            cached_function.cache_clear()
    except:
        errprint('ErrorWhenCleaningFunctionLruCache')
        traceback.print_exc()
def cron_task_container(task_dict, add_task_only=False):
    """Scheduled-task container: invoke the target callable, then register the
    next run of the same task.

    :param task_dict: task parameters, dict:
        { "target": the callable itself (not its name)  required,
          "interval": delay between runs in seconds     optional,
          "priority": scheduler priority                optional,
          "name": human-readable task name              optional,
          "args": positional args tuple (arg1, arg2)    optional,
          "kwargs": keyword args dict {key: value}      optional,
        }
    :param add_task_only: only schedule the task, do not execute it now
    """
    global task_scheduler
    if not add_task_only:
        # run the task immediately
        try:
            infoprint('CronTask:', task_dict.get('name', str(task_dict['target'])), 'Target:', str(task_dict['target']))
            func = task_dict.get('target')
            if func is None:
                raise ValueError("target is not given in " + str(task_dict))
            positional = task_dict.get('args', ())
            keyword = task_dict.get('kwargs', {})
            func(*positional, **keyword)
        except:
            errprint('ErrorWhenProcessingCronTasks', task_dict)
            traceback.print_exc()
    # register the next occurrence of this task
    task_scheduler.enter(
        task_dict.get('interval', 300),
        task_dict.get('priority', 999),
        cron_task_container,
        (task_dict,),
    )
def cron_task_host():
    """Host loop for scheduled tasks: wakes up once a minute and runs every
    task whose time has come.  Intended to run in its own thread."""
    while True:
        sleep(60)  # check the schedule once per minute
        try:
            task_scheduler.run()  # runs (and blocks on) all currently-due tasks
        except:
            errprint('ErrorDuringExecutingCronTasks')
            traceback.print_exc()
# noinspection PyShadowingNames
def calc_domain_replace_prefix(_domain):
    """Precompute every scheme/quoting/escaping variant of *_domain* that the
    text rewriter may need (plain, hex, JSON-escaped, urlencoded, and
    combinations)."""
    plain_slash = '//' + _domain
    plain_http = 'http://' + _domain
    plain_https = 'https://' + _domain
    quoted_double = '"%s"' % _domain
    quoted_single = "'%s'" % _domain

    def _esc(text):
        # JSON-style escaped slashes: / -> \/
        return text.replace('/', r'\/')

    def _double_esc(text):
        # doubly-escaped slashes: / -> \\\/
        return text.replace('/', r'\\\/')

    return dict(
        # normal
        slash=plain_slash,
        http=plain_http,
        https=plain_https,
        double_quoted=quoted_double,
        single_quoted=quoted_single,
        # hex-encoded slashes
        hex_lower=plain_slash.replace('/', r'\x2f'),
        hex_upper=plain_slash.replace('/', r'\x2F'),
        # escaped slashes
        slash_esc=_esc(plain_slash),
        http_esc=_esc(plain_http),
        https_esc=_esc(plain_https),
        double_quoted_esc='\\"%s\\"' % _domain,
        single_quoted_esc="\\'%s\\'" % _domain,
        # doubly-escaped slashes
        slash_double_esc=_double_esc(plain_slash),
        http_double_esc=_double_esc(plain_http),
        https_double_esc=_double_esc(plain_https),
        # urlencoded
        slash_ue=quote_plus(plain_slash),
        http_ue=quote_plus(plain_http),
        https_ue=quote_plus(plain_https),
        double_quoted_ue=quote_plus(quoted_double),
        single_quoted_ue=quote_plus(quoted_single),
        # escaped then urlencoded
        slash_esc_ue=quote_plus(_esc(plain_slash)),
        http_esc_ue=quote_plus(_esc(plain_http)),
        https_esc_ue=quote_plus(_esc(plain_https)),
    )
def add_temporary_domain_alias(source_domain, replaced_to_domain):
    """Register a per-request (source, replacement) domain pair.
    Used for plain-text domain replacement, see the `plain_replace_domain_alias`
    option.

    :param source_domain: the domain being replaced
    :param replaced_to_domain: the domain it is replaced with
    """
    existing = this_request.temporary_domain_alias
    aliases = [] if existing is None else list(existing)
    aliases.append((source_domain, replaced_to_domain))
    # stored as a tuple so later readers cannot mutate it accidentally
    this_request.temporary_domain_alias = tuple(aliases)
    dbgprint('A domain', source_domain, 'to', replaced_to_domain, 'added to temporary_domain_alias',
             this_request.temporary_domain_alias)
@lru_cache(maxsize=1024)
def is_domain_match_glob_whitelist(domain):
    """True when *domain* matches any glob pattern configured in
    `domains_whitelist_auto_add_glob_list`."""
    return any(fnmatch(domain, pattern) for pattern in domains_whitelist_auto_add_glob_list)
@lru_cache(maxsize=128)
def is_content_type_streamed(_content_type):
    """Should a response with this content-type be transferred in stream mode
    (forwarded to the client while still downloading)?  Binary content such as
    video/audio/images is streamed by default."""
    return any(keyword in _content_type for keyword in steamed_mime_keywords)
# noinspection PyGlobalUndefined
def try_match_and_add_domain_to_rewrite_white_list(domain, force_add=False):
    """If *domain* matches a glob in `domains_whitelist_auto_add_glob_list`
    (or *force_add* is set), add it to the `external_domains` list so the
    rewrite machinery applies to it.  Used to grow external_domains at
    runtime; also usable from custom_func.py.
    See default_config.py for the full documentation of external_domains.

    :return: True when the domain is (now) allowed, False otherwise
    """
    global external_domains, external_domains_set, allowed_domains_set, prefix_buff
    if not domain:  # covers both None and the empty string
        return False
    if domain in allowed_domains_set:
        return True
    if not force_add and not is_domain_match_glob_whitelist(domain):
        return False
    infoprint('A domain:', domain, 'was added to external_domains list')
    # external_domains is a tuple (for its slight performance edge), so
    # rebuild it with the new entry appended
    external_domains = tuple(list(external_domains) + [domain])
    external_domains_set.add(domain)
    allowed_domains_set.add(domain)
    prefix_buff[domain] = calc_domain_replace_prefix(domain)
    # append the domain to the on-disk log, best-effort
    try:
        with open('automatic_domains_whitelist.log', 'a', encoding='utf-8') as fp:
            fp.write(domain + '\n')
    except:
        traceback.print_exc()
    return True
def current_line_number():
    """Return the source line number of the call site."""
    import inspect
    caller_frame = inspect.currentframe().f_back
    return caller_frame.f_lineno
@lru_cache(maxsize=1024)
def extract_real_url_from_embedded_url(embedded_url):
    """
    Decode an url produced by embed_real_url_to_embedded_url() back into the
    original url including its query string.
    The `cdn_redirect_encode_query_str_into_url` option depends on this
    function; see its documentation in the config file for details.
    eg: https://cdn.domain.com/a.php_zm24_.cT1zb21ldGhpbmc=._zm24_.css
    ---> https://foo.com/a.php?q=something (assume it returns an css) (base64 only)
    eg2: https://cdn.domain.com/a/b/_zm24_.bG92ZT1saXZl._zm24_.jpg
    ---> https://foo.com/a/b/?love=live (assume it returns an jpg) (base64 only)
    eg3: https://cdn.domain.com/a/b/_zm24z_.[some long long base64 encoded string]._zm24_.jpg
    ---> https://foo.com/a/b/?love=live[and a long long query string] (assume it returns an jpg) (gzip + base64)
    eg4:https://cdn.domain.com/a (no change)
    ---> (no query string): https://foo.com/a (assume it returns an png) (no change)
    :param embedded_url: embedded_url
    :return: real url or None
    """
    if '._' + cdn_url_query_encode_salt + '_.' not in embedded_url[-15:]:  # check url mark
        return None
    m = regex_extract_base64_from_embedded_url.search(embedded_url)
    b64 = get_group('b64', m)
    if not b64:
        return None
    # 'https://cdn.domain.com/a.php_zm24_.cT1zb21ldGhpbmc=._zm24_.css'
    # real_request_url_no_query ---> 'https://cdn.domain.com/a.php'
    real_request_url_no_query = embedded_url[:m.span()[0]]
    try:
        query_string_byte = base64.urlsafe_b64decode(b64)
        is_gzipped = get_group('gzip', m)  # a 'z' marker means the query was zlib-compressed
        if is_gzipped:
            query_string_byte = zlib.decompress(query_string_byte)
        query_string = query_string_byte.decode(encoding='utf-8')
    except:
        # malformed base64 / gzip / utf-8 payload: treat as not-an-embedded-url
        traceback.print_exc()
        return None
    result = urljoin(real_request_url_no_query, '?' + query_string)
    # dbgprint('extract:', embedded_url, 'to', result)
    return result
@lru_cache(maxsize=1024)
def embed_real_url_to_embedded_url(real_url_raw, url_mime, escape_slash=False):
    """
    Encode an url's query string (?q=some&foo=bar) into its path, and append a
    file extension derived from *url_mime* to the end of the url.
    Reduces errors on CDNs that handle url parameters poorly.
    The `cdn_redirect_encode_query_str_into_url` option depends on this
    function; see the corresponding section of the config file for details.
    Decoding is performed by extract_real_url_from_embedded_url(); see that
    function for worked examples.

    :param real_url_raw: the real url (may contain JSON-escaped slashes when escape_slash is True)
    :param url_mime: mime of the resource, used to choose the fake file extension
    :param escape_slash: whether the input (and therefore the output) uses JSON-escaped slashes
    :return: the embedded url, or *real_url_raw* unchanged when there is no query string or on error
    """
    # dbgprint(real_url_raw, url_mime, escape_slash)
    if escape_slash:
        real_url = real_url_raw.replace(r'\/', '/')
    else:
        real_url = real_url_raw
    url_sp = urlsplit(real_url)
    if not url_sp.query:  # no query, needn't rewrite
        return real_url_raw
    try:
        byte_query = url_sp.query.encode()
        if len(byte_query) > 128:  # gzip-compress overly long query strings
            gzip_label = 'z'  # compressed queries carry a 'z' in the marker section
            byte_query = zlib.compress(byte_query)
        else:
            gzip_label = ''
        b64_query = base64.urlsafe_b64encode(byte_query).decode()
        # dbgprint(url_mime)
        # layout: <path>_<salt>[z]_.<base64>._<salt>_.<fake-extension>
        mixed_path = url_sp.path + '_' + _url_salt + gzip_label + '_.' \
                     + b64_query \
                     + '._' + _url_salt + '_.' + mime_to_use_cdn[url_mime]
        result = urlunsplit((url_sp.scheme, url_sp.netloc, mixed_path, '', ''))
    except:
        # unknown mime, encoding failure etc: fall back to the untouched input
        traceback.print_exc()
        return real_url_raw
    else:
        if escape_slash:
            result = result.replace('/', r'\/')
        # dbgprint('embed:', real_url_raw, 'to:', result)
        return result
def decode_mirror_url(mirror_url=None):
    """
    Parse a mirror url (which may contain /extdomains/) and extract the
    original-site url information.
    The input needn't be a full url: a path is enough (a query string may be
    included too).  When the argument is omitted, the url the current user is
    requesting is used.
    Supports JSON input (handles the escaped-slash and escaped-dot sequences).

    :param mirror_url: mirror url/path, or None for the current request
    :return: dict(domain, is_https, path, path_query)
    :rtype: {'domain':str, 'is_https':bool, 'path':str, 'path_query':str}
    """
    _is_escaped_dot = False
    _is_escaped_slash = False
    result = {}
    if mirror_url is None:
        input_path_query = extract_url_path_and_query()
    else:
        if r'\/' in mirror_url:  # un-escape JSON slashes first; re-escape once processing is done
            _is_escaped_slash = True
            mirror_url = mirror_url.replace(r'\/', '/')
        if r'\.' in mirror_url:  # same treatment for escaped dots
            _is_escaped_dot = True
            mirror_url = mirror_url.replace(r'\.', '.')
        input_path_query = extract_url_path_and_query(mirror_url)
    if input_path_query[:12] == '/extdomains/':
        # 12 == len('/extdomains/')
        domain_end_pos = input_path_query.find('/', 12)
        real_domain = input_path_query[12:domain_end_pos]
        real_path_query = input_path_query[domain_end_pos:]
        if real_domain[:6] == 'https-':  # the 'https-' prefix marks an https remote domain
            real_domain = real_domain[6:]
            _is_https = True
        else:
            _is_https = False
        real_path_query = client_requests_text_rewrite(real_path_query)
        if _is_escaped_dot:
            real_path_query = real_path_query.replace('.', r'\.')
        if _is_escaped_slash:
            real_path_query = real_path_query.replace('/', r'\/')
        result['domain'] = real_domain
        result['is_https'] = _is_https
        result['path_query'] = real_path_query
        result['path'] = urlsplit(result['path_query']).path
        return result
    # no /extdomains/ prefix: the url belongs to the main target domain
    input_path_query = client_requests_text_rewrite(input_path_query)
    if _is_escaped_dot:
        input_path_query = input_path_query.replace('.', r'\.')
    if _is_escaped_slash:
        input_path_query = input_path_query.replace('/', r'\/')
    result['domain'] = target_domain
    result['is_https'] = (target_scheme == 'https://')
    result['path_query'] = input_path_query
    result['path'] = urlsplit(result['path_query']).path
    return result


# function alias, kept for backward compatibility with early-version config files
extract_from_url_may_have_extdomains = decode_mirror_url
# noinspection PyShadowingNames
def encode_mirror_url(raw_url_or_path, remote_domain=None, is_scheme=None, is_escape=False):
    """Convert an url (or bare path) of the remote site into its mirror url.

    :param raw_url_or_path: remote url or path; may use JSON-escaped slashes when is_escape is True
    :param remote_domain: explicit remote domain; defaults to the url's own
        netloc, then the current request's remote domain, then target_domain
    :param is_scheme: True -> always prepend our scheme+host; False -> never;
        None -> only when the input itself carries a scheme or starts with '//'
    :param is_escape: whether the input uses JSON-escaped slashes; the output
        is re-escaped the same way
    :return: the mirror url; the input is returned unchanged when it is
        already a mirror path or its domain is not in the whitelist
    """
    if is_escape:
        # bugfix: this previously read `replace('r\/', r'/')` -- the stray `r`
        # INSIDE the quotes made it search for the literal characters `r\/`,
        # so JSON-escaped slashes were never un-escaped.  The raw-string
        # prefix belongs outside: turn every `\/` into `/`.
        _raw_url_or_path = raw_url_or_path.replace(r'\/', '/')
    else:
        _raw_url_or_path = raw_url_or_path
    sp = urlsplit(_raw_url_or_path)
    if '/extdomains/' == sp.path[:12]:  # already a mirror url, pass through untouched
        return raw_url_or_path
    domain = remote_domain or sp.netloc or this_request.remote_domain or target_domain
    if domain not in allowed_domains_set:
        return raw_url_or_path
    if is_scheme or ((sp.scheme or _raw_url_or_path[:2] == '//') and is_scheme is not False):
        our_prefix = myurl_prefix  # produce an absolute mirror url
    else:
        our_prefix = ''  # keep the result relative
    if domain not in domain_alias_to_target_set:
        # external domains are mirrored as /extdomains/[https-]the.domain/...
        remote_scheme = get_ext_domain_inurl_scheme_prefix(domain)
        middle_part = '/extdomains/' + remote_scheme + domain
    else:
        middle_part = ''
    result = urljoin(our_prefix + middle_part + '/',
                     extract_url_path_and_query(_raw_url_or_path).lstrip('/'))
    if is_escape:
        result = result.replace('/', r'\/')
    # NOTE(review): running the already-converted url through
    # response_text_rewrite() looks redundant -- confirm it is intentional
    return response_text_rewrite(result)


# function alias, kept for backward compatibility with early-version config files
convert_to_mirror_url = encode_mirror_url
def get_ext_domain_inurl_scheme_prefix(ext_domain, force_https=None):
    """Return the 'https-' marker this external domain carries inside mirror
    urls, or '' when it is fetched over plain http.

    :param ext_domain: the external domain
    :param force_https: explicit override; when not None it wins over the
        `force_https_domains` config
    """
    if force_https is not None:
        return 'https-' if force_https else ''
    if force_https_domains == 'NONE':
        return ''
    if force_https_domains == 'ALL':
        return 'https-'
    return 'https-' if ext_domain in force_https_domains else ''
def add_ssrf_allowed_domain(domain):
    """Add a domain to the SSRF whitelist; wildcards are not supported."""
    global allowed_domains_set
    allowed_domains_set.add(domain)
# noinspection PyGlobalUndefined
def set_request_for_debug(dummy_request):
    """Replace flask's module-level `request` proxy with a dummy object;
    intended for debugging/testing only."""
    global request
    request = dummy_request
def strx(*args, sep=' '):
    """Stringify every argument and join them with *sep*.

    Bugfix: the original appended *sep* after each item and then called
    ``output.rstrip(sep)`` WITHOUT using the return value (str methods return
    a new string, they never mutate), so the result always kept a trailing
    separator.  ``sep.join`` produces the intended string directly.

    :param args: values of any type; each is passed through str()
    :param sep: separator placed between items (not after the last one)
    :return: the joined string ('' for no arguments)
    """
    return sep.join(str(arg) for arg in args)
@lru_cache(maxsize=1024)
def check_global_ua_pass(ua_str):
    """Does this user-agent satisfy the global UA whitelist keyword?"""
    if ua_str is None or not global_ua_white_name:
        return False
    return global_ua_white_name in ua_str.lower()
@lru_cache(maxsize=128)
def is_mime_represents_text(input_mime):
    """
    Determine whether an mime is text (eg: text/html: True, image/png: False)
    :param input_mime: str
    :return: bool
    """
    lowered = input_mime.lower()
    return any(keyword in lowered for keyword in text_like_mime_keywords)
@lru_cache(maxsize=128)
def extract_mime_from_content_type(_content_type):
    """Extract the bare mime from a content-type,
    eg: 'text/html; encoding=utf-8' --> 'text/html'"""
    # everything before the first ';' (the whole string when there is none)
    return _content_type.split(';', 1)[0]
@lru_cache(maxsize=128)
def is_content_type_using_cdn(_content_type):
    """Return the mime when resources of this content-type should be served
    via CDN, otherwise False."""
    mime = extract_mime_from_content_type(_content_type)
    return mime if mime in mime_to_use_cdn else False
def generate_simple_resp_page(errormsg=b'We Got An Unknown Error', error_code=500):
    """Build a minimal flask error response.

    :param errormsg: response body (bytes)
    :param error_code: http status code
    """
    return make_response(errormsg, error_code)
def generate_html_redirect_page(target_url, msg='', delay_sec=1):
    """Generate an HTML redirect page.
    Some browsers refuse cookies on 301/302 responses, so an html redirect
    page is used instead to deliver the cookie.

    :param target_url: url to redirect the client to
    :param msg: message shown on the page before the redirect fires
    :param delay_sec: meta-refresh delay in seconds (the JS fallback fires one second later)
    """
    resp_content = r"""<!doctype html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>重定向 (Page Redirect)</title>
<meta http-equiv="refresh" content="%d; url=%s">
<script>setTimeout(function(){location.href="%s"} , %d000);</script>
</head>
<body>
<pre>%s</pre>
<hr />
You are now redirecting to <a href="%s">%s</a>, if it didn't redirect automatically, please click that link.
</body>
</html>""" % (
        delay_sec, html_escape(target_url), html_escape(target_url), delay_sec + 1,
        html_escape(msg), html_escape(target_url), html_escape(target_url)
    )
    resp_content = resp_content.encode('utf-8')
    return Response(response=resp_content)
def generate_304_response(_content_type=None):
    """Build an empty 304 (Not Modified) response, tagged as a local-cache hit."""
    r = Response(content_type=_content_type, status=304)
    r.headers.add('X-Cache', 'FileHit-304')
    return r
def generate_ip_verify_hash(input_dict):
    """
    Generate a hash identifying a verified visitor.
    Used by the human_ip_verification feature.
    The hash is 16 chars: hash(first 8 chars + salt) == last 8 chars.

    Bugfix: both halves were previously zero-padded only to a MINIMUM of 7
    chars while verify_ip_hash_cookie() splits at a fixed position 8, so the
    roundtrip failed whenever adler32 produced fewer than 8 hex digits.
    hex(adler32) is at most 8 hex digits, so padding to 8 yields a fixed
    8-char width and a deterministic layout.
    """
    strbuff = human_ip_verification_answers_hash_str
    for key in input_dict:
        strbuff += key + input_dict[key] + str(random.randint(0, 9000000))
    input_key_hash = hex(zlib.adler32(strbuff.encode(encoding='utf-8')))[2:]
    while len(input_key_hash) < 8:  # pad to a fixed 8 chars (was 7: off-by-one vs verify's [:8] split)
        input_key_hash += '0'
    output_hash = hex(zlib.adler32((input_key_hash + human_ip_verification_answers_hash_str).encode(encoding='utf-8')))[2:]
    while len(output_hash) < 8:
        output_hash += '0'
    return input_key_hash + output_hash


@lru_cache(maxsize=1024)
def verify_ip_hash_cookie(hash_cookie_value):
    """
    Decide from the cookie hash whether the visitor is allowed.
    Used by the human_ip_verification feature.
    The hash is 16 chars: hash(first 8 chars + salt) == last 8 chars,
    as produced by generate_ip_verify_hash().
    :type hash_cookie_value: str
    """
    try:
        input_key_hash = hash_cookie_value[:8]
        output_hash = hash_cookie_value[8:]
        calculated_hash = hex(zlib.adler32(
            (input_key_hash + human_ip_verification_answers_hash_str).encode(encoding='utf-8')
        ))[2:]
        while len(calculated_hash) < 8:  # bugfix: apply the same zero-padding as the generator
            calculated_hash += '0'
        return output_hash == calculated_hash
    except:
        return False
def update_content_in_local_cache(url, content, method='GET'):
    """Update a resource already stored in local_cache by attaching its content.
    Used in stream mode.

    :param url: cache key (real url with query string)
    :param content: the complete response body, bytes
    :param method: only GET responses are cached
    """
    if local_cache_enable and method == 'GET' and cache.is_cached(url):
        info_dict = cache.get_info(url)
        resp = cache.get_obj(url)
        resp.set_data(content)
        # `without_content` is True while the stored resource has no complete body.
        # Such an entry never serves hits; it becomes effective only once the
        # content has been attached.
        # In stream mode the http headers arrive before the body, so a
        # headers-only response is cached first and the body is appended here
        # after the transfer finishes.
        info_dict['without_content'] = False
        if verbose_level >= 4: dbgprint('LocalCache_UpdateCache', url, content[:30], len(content))
        cache.put_obj(
            url,
            resp,
            obj_size=len(content),
            expires=get_expire_from_mime(this_request.mime),
            last_modified=info_dict.get('last_modified'),
            info_dict=info_dict,
        )
def put_response_to_local_cache(url, _our_resp, without_content=False):
    """
    put our response object(headers included) to local cache
    :param without_content: for stream mode use
    :param url: client request url
    :param _our_resp: our response(flask response object) to client, would be storge
    :return: None
    """
    # Only cache GET method, and only when remote returns 200(OK) status
    if local_cache_enable and request.method == 'GET' and this_request.remote_response.status_code == 200:
        if without_content:
            # shallow-copy so we can drop the streaming iterator without
            # touching the response that is being sent to the client
            our_resp = copy.copy(_our_resp)
            our_resp.response = None  # delete iterator
        else:
            our_resp = _our_resp
        # the header's character cases are different in flask/apache(win)/apache(linux)
        last_modified = this_request.remote_response.headers.get('last-modified', None) \
                        or this_request.remote_response.headers.get('Last-Modified', None)
        dbgprint('PuttingCache:', url)
        cache.put_obj(
            url,
            our_resp,
            expires=get_expire_from_mime(this_request.mime),
            obj_size=0 if without_content else len(this_request.remote_response.content),
            last_modified=last_modified,
            info_dict={'without_content': without_content,
                       'last_modified': last_modified,
                       },
        )
def try_get_cached_response(url, client_header=None):
    """Try to serve a response from the local cache.

    :param url: real url with query string
    :type client_header: dict
    :return: a cached flask Response, a 304 response for conditional
        requests, or None on cache miss / header-only entry
    """
    # Only use cache when client use GET
    if not (local_cache_enable and request.method == 'GET' and cache.is_cached(url)):
        return None

    # conditional request: answer 304 when the cached copy is unchanged
    if client_header is not None and 'if-modified-since' in client_header \
            and cache.is_unchanged(url, client_header.get('if-modified-since', None)):
        dbgprint('FileCacheHit-304', url)
        return generate_304_response()

    cached_info = cache.get_info(url)
    if cached_info.get('without_content', False):
        # header-only entry created in stream mode; the body has not been
        # appended yet, so this entry must not be served
        return None

    # dbgprint('FileCacheHit-200')
    resp = cache.get_obj(url)
    assert isinstance(resp, Response)
    resp.headers.set('x-zmirror-cache', 'FileHit')
    return resp
def get_group(name, match_obj):
    """Return the named group of a regex match, or '' if it is absent.

    :param name: name of the regex capture group to look up
    :param match_obj: match object of stdlib re
    :return: the group's matched string; '' when the pattern has no group
        with this name, or when the group did not participate in the match
    """
    try:
        obj = match_obj.group(name)
    except IndexError:
        # the pattern defines no group with this name; was a bare `except:`
        # which silently swallowed unrelated errors as well
        return ''
    else:
        # a defined group that did not participate in the match yields None
        return obj if obj is not None else ''
def regex_url_reassemble(match_obj):
"""
Reassemble url parts split by the regex.
:param match_obj: match object of stdlib re
:return: re assembled url string (included prefix(url= etc..) and suffix.)
"""
if match_obj.group() in url_rewrite_cache: # Read Cache
global url_rewrite_cache_hit_count
url_rewrite_cache_hit_count += 1
return url_rewrite_cache[match_obj.group()]
else:
global url_rewrite_cache_miss_count
prefix = get_group('prefix', match_obj)
quote_left = get_group('quote_left', match_obj)
quote_right = get_group('quote_right', match_obj)
path = get_group('path', match_obj)
match_domain = get_group('domain', match_obj)
scheme = get_group('scheme', match_obj)
whole_match_string = match_obj.group()
# dbgprint('prefix', prefix, 'quote_left', quote_left, 'quote_right', quote_right,
# 'path', path, 'match_domain', match_domain, 'scheme', scheme, 'whole', whole_match_string)
if r"\/" in path or r"\/" in scheme:
require_slash_escape = True
path = path.replace(r"\/", "/")
# domain_and_scheme = domain_and_scheme.replace(r"\/", "/")
else:
require_slash_escape = False
# path must be not blank
if (not path # path is blank
# only url(something) and @import are allowed to be unquoted
or ('url' not in prefix and 'import' not in prefix) and (not quote_left or quote_right == ')')
# for "key":"value" type replace, we must have at least one '/' in url path (for the value to be regard as url)
or (':' in prefix and '/' not in path)
# if we have quote_left, it must equals to the right
or (quote_left and quote_left != quote_right)
# in javascript, those 'path' contains one or only two slash, should not be rewrited (for potential error)
# or (this_request.mime == 'application/javascript' and path.count('/') < 2)
# in javascript, we only rewrite those with explicit scheme ones.
# v0.21.10+ in "key":"value" format, we should ignore those path without scheme
or (not scheme and ('javascript' in this_request.mime or '"' in prefix))
):
# dbgprint('returned_un_touch', whole_match_string)
return whole_match_string
# v0.19.0+ Automatic Domains Whitelist (Experimental)
if enable_automatic_domains_whitelist:
try_match_and_add_domain_to_rewrite_white_list(match_domain)
# dbgprint('remote_path:', remote_path, 'remote_domain:', remote_domain, 'match_domain', match_domain, v=5)
# dbgprint(match_obj.groups(), v=5)
# dbgprint('remote_path:', remote_path, 'remote_domain:', remote_domain, 'match_domain', match_domain, v=5)
domain = match_domain or this_request.remote_domain
# dbgprint('rewrite match_obj:', match_obj, 'domain:', domain, v=5)
# skip if the domain are not in our proxy list
if domain not in allowed_domains_set:
# dbgprint('return untouched because domain not match', domain, whole_match_string)
return match_obj.group() # return raw, do not change
# this resource's absolute url path to the domain root.
# dbgprint('match path', path, v=5)
path = urljoin(this_request.remote_path, path)
# dbgprint('middle path', path, v=5)
if ':' not in this_request.remote_domain: # the python's builtin urljoin has a bug, cannot join domain with port correctly
url_no_scheme = urljoin(domain + '/', path.lstrip('/'))
else:
url_no_scheme = domain + '/' + path.lstrip('/')
# dbgprint('url_no_scheme', url_no_scheme)