diff --git a/.travis.yml b/.travis.yml index 529d8cf2d..a85dd8e93 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,13 +4,9 @@ python: - "3.5" - "3.6" - "3.7" - - "3.8" - - "3.9" - - "3.10" - - "3.11" os: - linux install: - pip install -r requirements.txt -script: python test.py +script: python test.py \ No newline at end of file diff --git "a/Proxy_Pool \344\273\243\347\220\206\346\261\240\351\241\271\347\233\256\345\277\253\351\200\237\346\220\255\345\273\272.md" "b/Proxy_Pool \344\273\243\347\220\206\346\261\240\351\241\271\347\233\256\345\277\253\351\200\237\346\220\255\345\273\272.md" new file mode 100644 index 000000000..a59f8266f --- /dev/null +++ "b/Proxy_Pool \344\273\243\347\220\206\346\261\240\351\241\271\347\233\256\345\277\253\351\200\237\346\220\255\345\273\272.md" @@ -0,0 +1,134 @@ + + +# Proxy_Pool 代理池项目快速搭建 + +## 环境准备 + +* Linux x64 +* docker or 本地 +* git 工具 +* python3 + + + +## 项目搭建 + +### Linux 配置 + +1. 防火墙关闭 + +```bash +systemctl stop firewalld # 关闭防火墙 +systemctl disable firewalld # 禁用防火墙 +``` + +2. 安装python3 + +``` +sudo yum install python3 python3-pip +``` + +3. 关闭selinux + +``` +sudo vim /etc/selinux/config +# 修改SELINUX的值为disabled后重启系统 +SELINUX=disabled +``` + + + + +### docker 安装 + +1. 一键自动化安装: + +```bash +curl -fsSL https://get.docker.com | bash -s docker --mirror Aliyun +``` + +2. 启动docker + +```bash +sudo systemctl start docker +sudo systemctl enable docker +``` + +3. docker切换镜像源 + +```bash + sudo mkdir -p /etc/docker + sudo tee /etc/docker/daemon.json <<-'EOF' + { + "registry-mirrors": ["https://yytcclg8.mirror.aliyuncs.com"] + } + EOF + sudo systemctl daemon-reload + sudo systemctl restart docker +``` + + + +### Redis 安装 + +1. 利用docker安装redis + +```dockerfile +sudo docker pull redis +``` + +2. 启动redis +```bash +sudo docker run -d --name redis -p 6379:6379 redis --requirepass "password" +# -p 端口映射 (宿主机端口:容器端口) +# --requirepass "密码" 设置redis访问密码 +``` + + + +### Proxy_Pool 安装 + +1. 项目下载 + +```git +git clone https://github.com/jhao104/proxy_pool.git +``` + +2. 
安装项目依赖 + +``` +python3 -m pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple +``` + + + +3. 配置修改 + +```bash +sudo vim setting.py + +# 修改配置信息 +DB_CONN = 'redis://:pwd@127.0.0.1:6379/0' #修改对应的配置信息 +``` + + +```bash +sudo vim start.sh + +python3 proxyPool.py server & +python3 proxyPool.py schedule # 修改对应python3 执行名称 +``` + +4. 运行程序 + +```bash +yum install screen #安装后台运行程序 +screen -S proxy_pool #创建一个名为proxy_pool的终端 +./start.sh # 启动程序 +ctrl+a+d #返回主终端,proxy_pool终端进入后台运行 +``` + +5. 运行成果 +![](https://s2.loli.net/2022/04/06/KTuQ2yHS7zPj8mW.png) + +![image-20220406110323384](https://s2.loli.net/2022/04/06/js5Og48puJzQ6Tt.png) \ No newline at end of file diff --git a/README.md b/README.md index 6a009b897..06c4916bf 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,9 @@ - + ProxyPool 爬虫代理IP池 ======= [![Build Status](https://travis-ci.org/jhao104/proxy_pool.svg?branch=master)](https://travis-ci.org/jhao104/proxy_pool) [![](https://img.shields.io/badge/Powered%20by-@j_hao104-green.svg)](http://www.spiderpy.cn/blog/) +[![Requirements Status](https://requires.io/github/jhao104/proxy_pool/requirements.svg?branch=master)](https://requires.io/github/jhao104/proxy_pool/requirements/?branch=master) [![Packagist](https://img.shields.io/packagist/l/doctrine/orm.svg)](https://github.com/jhao104/proxy_pool/blob/master/LICENSE) [![GitHub contributors](https://img.shields.io/github/contributors/jhao104/proxy_pool.svg)](https://github.com/jhao104/proxy_pool/graphs/contributors) [![](https://img.shields.io/badge/language-Python-green.svg)](https://github.com/jhao104/proxy_pool) @@ -22,18 +23,11 @@ ProxyPool 爬虫代理IP池 * 文档: [document](https://proxy-pool.readthedocs.io/zh/latest/) [![Documentation Status](https://readthedocs.org/projects/proxy-pool/badge/?version=latest)](https://proxy-pool.readthedocs.io/zh/latest/?badge=latest) -* 支持版本: [![](https://img.shields.io/badge/Python-2.7-green.svg)](https://docs.python.org/2.7/) 
-[![](https://img.shields.io/badge/Python-3.5-blue.svg)](https://docs.python.org/3.5/) -[![](https://img.shields.io/badge/Python-3.6-blue.svg)](https://docs.python.org/3.6/) -[![](https://img.shields.io/badge/Python-3.7-blue.svg)](https://docs.python.org/3.7/) -[![](https://img.shields.io/badge/Python-3.8-blue.svg)](https://docs.python.org/3.8/) -[![](https://img.shields.io/badge/Python-3.9-blue.svg)](https://docs.python.org/3.9/) -[![](https://img.shields.io/badge/Python-3.10-blue.svg)](https://docs.python.org/3.10/) -[![](https://img.shields.io/badge/Python-3.11-blue.svg)](https://docs.python.org/3.11/) +* 支持版本: ![](https://img.shields.io/badge/Python-2.x-green.svg) ![](https://img.shields.io/badge/Python-3.x-blue.svg) * 测试地址: http://demo.spiderpy.cn (勿压谢谢) -* 付费代理推荐: [luminati-china](https://get.brightdata.com/github_jh). 国外的亮数据BrightData(以前叫luminati)被认为是代理市场领导者,覆盖全球的7200万IP,大部分是真人住宅IP,成功率扛扛的。付费套餐多种,需要高质量代理IP的可以注册后联系中文客服。[申请免费试用](https://get.brightdata.com/github_jh) 现在有首充多少送多少的活动。(PS:用不明白的同学可以参考这个[使用教程](https://www.cnblogs.com/jhao/p/15611785.html))。 +* 付费代理推荐: [luminati-china](https://brightdata.grsm.io/proxyPool). 国外的亮数据BrightData(以前叫luminati)被认为是代理市场领导者,覆盖全球的7200万IP,大部分是真人住宅IP,成功率扛扛的。付费套餐多种,需要高质量代理IP的可以注册后联系中文客服,开通后有5美金赠送和教程指引(PS:用不明白的同学可以参考这个[使用教程](https://www.cnblogs.com/jhao/p/15611785.html))。 ### 运行项目 @@ -98,6 +92,50 @@ python proxyPool.py server ``` + +# 关于 Py 312 报错的问题解决: +1. from imp import reload as reload_six +ModuleNotFoundError: No module named 'imp' +解决方法:if PY3: +import importlib +reload_six = importlib.reload +else: +reload_six = reload + +2. from collections import MutableMapping +ImportError: cannot import name 'MutableMapping' from 'collections' +解决方法:from collections.abc import MutableMapping + +3. from collections import Iterable, Mapping +ImportError: cannot import name 'Iterable' from 'collections' +解决方法:from collections.abc import Iterable, Mapping + +4. 
from .packages.six.moves.http_client import ( +ModuleNotFoundError: No module named 'urllib3.packages.six.moves' +解决方法:更新urllib3包到最新版本 + +处理完这些问题 python proxyPool.py schedule 就正常启动了。 +------------------------------------------------------------------------------- + +* 执行 python proxyPool.py server 也会报好多错: + +1.from jinja2 import Markup, escape +ImportError: cannot import name 'Markup' from 'jinja2' +解决方法:改成 from jinja2 import pass_eval_context +from markupsafe import Markup, escape + +2.from itsdangerous import json as _json +ImportError: cannot import name 'json' from 'itsdangerous' +解决方法:改成 import json as _json + +3.from collections import MutableMapping +ImportError: cannot import name 'MutableMapping' from 'collections' +解决方法:改成 from collections.abc import MutableMapping + +然后就可以 get 到 代理地址 了 + + + ### Docker Image ```bash @@ -205,19 +243,18 @@ PROXY_FETCHER = [ 目前实现的采集免费代理网站有(排名不分先后, 下面仅是对其发布的免费代理情况, 付费代理测评可以参考[这里](https://zhuanlan.zhihu.com/p/33576641)): - | 代理名称 | 状态 | 更新速度 | 可用率 | 地址 | 代码 | - |---------------| ---- | -------- | ------ | ----- |------------------------------------------------| - | 站大爷 | ✔ | ★ | ** | [地址](https://www.zdaye.com/) | [`freeProxy01`](/fetcher/proxyFetcher.py#L28) | - | 66代理 | ✔ | ★ | * | [地址](http://www.66ip.cn/) | [`freeProxy02`](/fetcher/proxyFetcher.py#L50) | - | 开心代理 | ✔ | ★ | * | [地址](http://www.kxdaili.com/) | [`freeProxy03`](/fetcher/proxyFetcher.py#L63) | - | FreeProxyList | ✔ | ★ | * | [地址](https://www.freeproxylists.net/zh/) | [`freeProxy04`](/fetcher/proxyFetcher.py#L74) | - | 快代理 | ✔ | ★ | * | [地址](https://www.kuaidaili.com/) | [`freeProxy05`](/fetcher/proxyFetcher.py#L92) | - | 冰凌代理 | ✔ | ★★★ | * | [地址](https://www.binglx.cn/) | [`freeProxy06`](/fetcher/proxyFetcher.py#L111) | - | 云代理 | ✔ | ★ | * | [地址](http://www.ip3366.net/) | [`freeProxy07`](/fetcher/proxyFetcher.py#L123) | - | 小幻代理 | ✔ | ★★ | * | [地址](https://ip.ihuan.me/) | [`freeProxy08`](/fetcher/proxyFetcher.py#L133) | - | 免费代理库 | ✔ | ☆ | * | 
[地址](http://ip.jiangxianli.com/) | [`freeProxy09`](/fetcher/proxyFetcher.py#L143) | - | 89代理 | ✔ | ☆ | * | [地址](https://www.89ip.cn/) | [`freeProxy10`](/fetcher/proxyFetcher.py#L154) | - | 稻壳代理 | ✔ | ★★ | *** | [地址](https://www.docip.ne) | [`freeProxy11`](/fetcher/proxyFetcher.py#L164) | + | 代理名称 | 状态 | 更新速度 | 可用率 | 地址 | 代码 | + | --------- | ---- | -------- | ------ | ----- | ------- | + | 站大爷 | ✔ | ★ | ** | [地址](https://www.zdaye.com/) | [`freeProxy01`](/fetcher/proxyFetcher.py#L28) | + | 66代理 | ✔ | ★ | * | [地址](http://www.66ip.cn/) | [`freeProxy02`](/fetcher/proxyFetcher.py#L50) | + | 开心代理 | ✔ | ★ | * | [地址](http://www.kxdaili.com/) | [`freeProxy03`](/fetcher/proxyFetcher.py#L63) | + | FreeProxyList | ✔ | ★ | * | [地址](https://www.freeproxylists.net/zh/) | [`freeProxy04`](/fetcher/proxyFetcher.py#L74) | + | 快代理 | ✔ | ★ | * | [地址](https://www.kuaidaili.com/) | [`freeProxy05`](/fetcher/proxyFetcher.py#L92) | + | FateZero | ✔ | ★★ | * | [地址](http://proxylist.fatezero.org) | [`freeProxy06`](/fetcher/proxyFetcher.py#L111) | + | 云代理 | ✔ | ★ | * | [地址](http://www.ip3366.net/) | [`freeProxy07`](/fetcher/proxyFetcher.py#L124) | + | 小幻代理 | ✔ | ★★ | * | [地址](https://ip.ihuan.me/) | [`freeProxy08`](/fetcher/proxyFetcher.py#L134) | + | 免费代理库 | ✔ | ☆ | * | [地址](http://ip.jiangxianli.com/) | [`freeProxy09`](/fetcher/proxyFetcher.py#L144) | + | 89代理 | ✔ | ☆ | * | [地址](https://www.89ip.cn/) | [`freeProxy10`](/fetcher/proxyFetcher.py#L155) | 如果还有其他好的免费代理网站, 可以在提交在[issues](https://github.com/jhao104/proxy_pool/issues/71), 下次更新时会考虑在项目中支持。 @@ -242,5 +279,3 @@ PROXY_FETCHER = [ ### Release Notes [changelog](https://github.com/jhao104/proxy_pool/blob/master/docs/changelog.rst) - -Featured|HelloGitHub diff --git a/docs/changelog.rst b/docs/changelog.rst index e3889882c..d77c65dc2 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -3,13 +3,6 @@ ChangeLog ========== -2.4.2 (2024-01-18) ------------------- - -1. 代理格式检查支持需认证的代理格式 `username:password@ip:port` ; (2023-03-10) -2. 
新增代理源 **稻壳代理**; (2023-05-15) -3. 新增代理源 **冰凌代理**; (2023-01-18) - 2.4.1 (2022-07-17) ------------------ diff --git a/fetcher/proxyFetcher.py b/fetcher/proxyFetcher.py index cfc37f928..2ddcad9bd 100644 --- a/fetcher/proxyFetcher.py +++ b/fetcher/proxyFetcher.py @@ -30,14 +30,14 @@ def freeProxy01(): 站大爷 https://www.zdaye.com/dayProxy.html """ start_url = "https://www.zdaye.com/dayProxy.html" - html_tree = WebRequest().get(start_url, verify=False).tree + html_tree = WebRequest().get(start_url).tree latest_page_time = html_tree.xpath("//span[@class='thread_time_info']/text()")[0].strip() from datetime import datetime interval = datetime.now() - datetime.strptime(latest_page_time, "%Y/%m/%d %H:%M:%S") if interval.seconds < 300: # 只采集5分钟内的更新 target_url = "https://www.zdaye.com/" + html_tree.xpath("//h3[@class='thread_title']/a/@href")[0].strip() while target_url: - _tree = WebRequest().get(target_url, verify=False).tree + _tree = WebRequest().get(target_url).tree for tr in _tree.xpath("//table//tr"): ip = "".join(tr.xpath("./td[1]/text()")).strip() port = "".join(tr.xpath("./td[2]/text()")).strip() @@ -109,13 +109,14 @@ def freeProxy05(page_count=1): @staticmethod def freeProxy06(): - """ 冰凌代理 https://www.binglx.cn """ - url = "https://www.binglx.cn/?page=1" + """ FateZero http://proxylist.fatezero.org/ """ + url = "http://proxylist.fatezero.org/proxy.list" try: - tree = WebRequest().get(url).tree - proxy_list = tree.xpath('.//table//tr') - for tr in proxy_list[1:]: - yield ':'.join(tr.xpath('./td/text()')[0:2]) + resp_text = WebRequest().get(url).text + for each in resp_text.split("\n"): + json_info = json.loads(each) + if json_info.get("country") == "CN": + yield "%s:%s" % (json_info.get("host", ""), json_info.get("port", "")) except Exception as e: print(e) @@ -144,7 +145,7 @@ def freeProxy09(page_count=1): """ 免费代理库 """ for i in range(1, page_count + 1): url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(i) - html_tree = WebRequest().get(url, 
verify=False).tree + html_tree = WebRequest().get(url).tree for index, tr in enumerate(html_tree.xpath("//table//tr")): if index == 0: continue @@ -160,16 +161,6 @@ def freeProxy10(): for proxy in proxies: yield ':'.join(proxy) - @staticmethod - def freeProxy11(): - """ 稻壳代理 https://www.docip.net/ """ - r = WebRequest().get("https://www.docip.net/data/free.json", timeout=10) - try: - for each in r.json['data']: - yield each['ip'] - except Exception as e: - print(e) - # @staticmethod # def wallProxy01(): # """ diff --git a/helper/validator.py b/helper/validator.py index 136691c2e..a5a21ab68 100644 --- a/helper/validator.py +++ b/helper/validator.py @@ -7,12 +7,12 @@ date: 2021/5/25 ------------------------------------------------- Change Activity: - 2023/03/10: 支持带用户认证的代理格式 username:password@ip:port + 2021/5/25: ------------------------------------------------- """ __author__ = 'JHao' -import re +from re import findall from requests import head from util.six import withMetaclass from util.singleton import Singleton @@ -25,8 +25,6 @@ 'Connection': 'keep-alive', 'Accept-Language': 'zh-CN,zh;q=0.8'} -IP_REGEX = re.compile(r"(.*:.*@)?\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}") - class ProxyValidator(withMetaclass(Singleton)): pre_validator = [] @@ -52,7 +50,9 @@ def addHttpsValidator(cls, func): @ProxyValidator.addPreValidator def formatValidator(proxy): """检查代理格式""" - return True if IP_REGEX.fullmatch(proxy) else False + verify_regex = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}" + _proxy = findall(verify_regex, proxy) + return True if len(_proxy) == 1 and _proxy[0] == proxy else False @ProxyValidator.addHttpValidator diff --git a/requirements.txt b/requirements.txt index 53dc129b7..1da597ed2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,8 @@ +APScheduler==3.2.0 +werkzeug==0.15.5 +Flask==1.0 requests==2.20.0 +click==7.0 gunicorn==19.9.0 -lxml==4.9.2 -redis==3.5.3 -APScheduler==3.10.0;python_version>="3.10" -APScheduler==3.2.0;python_version<"3.10" 
-click==8.0.1;python_version>"3.6" -click==7.0;python_version<="3.6" -Flask==2.1.1;python_version>"3.6" -Flask==1.0;python_version<="3.6" -werkzeug==2.1.0;python_version>"3.6" -werkzeug==0.15.5;python_version<="3.6" +lxml +redis diff --git a/setting.py b/setting.py index 9bab8475c..8bfa0ccf4 100644 --- a/setting.py +++ b/setting.py @@ -30,14 +30,14 @@ # ############### server config ############### HOST = "0.0.0.0" -PORT = 5010 +PORT = 5001 # ############### database config ################### # db connection uri # example: # Redis: redis://:password@ip:port/db # Ssdb: ssdb://:password@ip:port -DB_CONN = 'redis://:pwd@127.0.0.1:6379/0' +DB_CONN = 'redis://:@127.0.0.1:6379/0' # proxy table name TABLE_NAME = 'use_proxy' @@ -54,8 +54,7 @@ "freeProxy07", "freeProxy08", "freeProxy09", - "freeProxy10", - "freeProxy11" + "freeProxy10" ] # ############# proxy validator ################# diff --git a/util/six.py b/util/six.py index 14ee059ba..1d7e3fc02 100644 --- a/util/six.py +++ b/util/six.py @@ -29,8 +29,13 @@ def iteritems(d, **kw): else: from urlparse import urlparse +# if PY3: +# from imp import reload as reload_six +# else: +# reload_six = reload if PY3: - from imp import reload as reload_six + import importlib + reload_six = importlib.reload else: reload_six = reload