There is a kind of thirst that only wine can quench, and that thirst is loneliness.
Finding the data according to the encoding the page comes back with
For example, if I want to pull out a page's title, the direct approach is a regex match.
Solution:
import re
import requests

# url is the bare host (e.g. 'example.com') and headers carries the User-Agent dict; both are assumed to be defined earlier
r_port_top = requests.get(url='http://' + url, headers=headers, timeout=5)
if r_port_top.encoding == 'ISO-8859-1':
    # requests falls back to ISO-8859-1 when the HTTP headers declare no charset,
    # so look for the encoding declared inside the page, then fall back to apparent_encoding
    encodings = requests.utils.get_encodings_from_content(r_port_top.text)
    if encodings:
        encoding = encodings[0]
    else:
        encoding = r_port_top.apparent_encoding
    encode_content = r_port_top.content.decode(encoding, 'replace').encode('utf-8', 'replace')
    port_title = re.search('<title>(.*?)</title>', encode_content, re.S).group(1)
This approach first checks the encoding the response declares and then converts the content. But it only handles a single case, so some pages still come out garbled; next is the "ultimate" version that enumerates more encodings.
try:
    # headerss is assumed to be a predefined list of User-Agent strings
    UA = random.choice(headerss)
    headers = {'User-Agent': UA}
    r_port_top = requests.get(url='http://' + url, headers=headers, timeout=5)
    if r_port_top.encoding == 'ISO-8859-1':
        encodings = requests.utils.get_encodings_from_content(r_port_top.text)
        if encodings:
            encoding = encodings[0]
        else:
            encoding = r_port_top.apparent_encoding
        encode_content = r_port_top.content.decode(encoding, 'replace').encode('utf-8', 'replace')
        port_title = re.search('<title>(.*?)</title>', encode_content, re.S).group().replace('<title>', '').replace('</title>', '')
    elif r_port_top.encoding == 'GB2312':
        encodings = requests.utils.get_encodings_from_content(r_port_top.text)
        if encodings:
            encoding = encodings[0]
        else:
            encoding = r_port_top.apparent_encoding
        encode_content = r_port_top.content.decode(encoding, 'replace').encode('utf-8', 'replace')
        port_title = re.search('<title>(.*?)</title>', encode_content, re.S).group().replace('<title>', '').replace('</title>', '')
    elif r_port_top.encoding == 'gb2312':
        encodings = requests.utils.get_encodings_from_content(r_port_top.text)
        if encodings:
            encoding = encodings[0]
        else:
            encoding = r_port_top.apparent_encoding
        encode_content = r_port_top.content.decode(encoding, 'replace').encode('utf-8', 'replace')
        port_title = re.search('<title>(.*?)</title>', encode_content, re.S).group().replace('<title>', '').replace('</title>', '')
    elif r_port_top.encoding == 'GBK':
        encodings = requests.utils.get_encodings_from_content(r_port_top.text)
        if encodings:
            encoding = encodings[0]
        else:
            encoding = r_port_top.apparent_encoding
        encode_content = r_port_top.content.decode(encoding, 'replace').encode('utf-8', 'replace')
        port_title = re.search('<title>(.*?)</title>', encode_content, re.S).group().replace('<title>', '').replace('</title>', '')
    elif r_port_top.encoding == 'gbk':
        encodings = requests.utils.get_encodings_from_content(r_port_top.text)
        if encodings:
            encoding = encodings[0]
        else:
            encoding = r_port_top.apparent_encoding
        encode_content = r_port_top.content.decode(encoding, 'replace').encode('utf-8', 'replace')
        port_title = re.search('<title>(.*?)</title>', encode_content, re.S).group().replace('<title>', '').replace('</title>', '')
    else:
        port_title = re.search('<title>(.*?)</title>', r_port_top.content, re.S).group().replace('<title>', '').replace('</title>', '')
except:
    try:
        port_title = re.search('<title>(.*?)</title>', r_port_top.content, re.S).group().replace('<title>', '').replace('</title>', '')
    except:
        port_title = 'could not get the site title'
Detecting and converting directly with chardet
The method above is really clumsy; chardet solves the page-encoding problem with almost no effort.
# -*- coding: utf-8 -*-
# @Time : 2018/5/4 0004 8:55
# @Author : Langzi
# @Blog : www.sxadmin.github.io
# @File : get urls.py
# @Software: PyCharm
import sys
import chardet
import re
import requests

reload(sys)
sys.setdefaultencoding('utf-8')

url = 'https://stackoverflow.com'
d1 = requests.get(url)
print d1.content
if isinstance(d1.content, unicode):
    pass
else:
    # detect the encoding of the raw bytes, then decode them to unicode with it
    codesty = chardet.detect(d1.content)
    a = d1.content.decode(codesty['encoding'])
The resulting a is the page content decoded into its real encoding; at this point you can run re.search on it directly to grab the title.
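As a minimal sketch continuing the chardet snippet above (d1 and a come from that block; the regex flags are my own choice):

title_match = re.search(r'<title>(.*?)</title>', a, re.S | re.I)
if title_match:
    print(title_match.group(1).strip())
else:
    print('no <title> tag found')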
Of course, there is an even simpler way.
requests ships with an API that can quickly identify a page's encoding, after which you can convert the content to utf-8.
import requests

url = 'https://sxadmin.github.io'
r = requests.get(url)
encoding = requests.utils.get_encodings_from_content(r.text)[0]
print(encoding)
res = r.content.decode(encoding, 'replace')
# 'replace' swaps any undecodable bytes for a placeholder, which makes it easy to spot which characters had encoding problems
res = r.content.decode(encoding, 'ignore')
# 'ignore' drops the undecodable bytes and keeps only what decodes cleanly
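To make the difference between the two error handlers concrete, here is a tiny sketch with some made-up GBK bytes (not from the original post):

data = u'网页标题'.encode('gbk')              # GBK bytes, which are not valid UTF-8
print(repr(data.decode('utf-8', 'replace')))  # undecodable bytes become the U+FFFD replacement character
print(repr(data.decode('utf-8', 'ignore')))   # undecodable bytes are silently dropped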
Reading the source of that API shows that it works by pulling the declared encoding out of the page with regular expressions:
def get_encodings_from_content(content):
    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

    return (charset_re.findall(content) +
            pragma_re.findall(content) +
            xml_re.findall(content))
Once you know how it works, you can port this function into your own utility code. Nice little trick to pick up~
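For example, here is a rough sketch of what such a ported helper could look like (the name guess_encoding and the fallback to apparent_encoding are my own choices, not something requests provides as a single call):

import re
import requests

def guess_encoding(response):
    # same three regexes as requests.utils.get_encodings_from_content,
    # falling back to apparent_encoding (chardet-based) when the page declares nothing
    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
    declared = (charset_re.findall(response.text) +
                pragma_re.findall(response.text) +
                xml_re.findall(response.text))
    return declared[0] if declared else response.apparent_encoding

r = requests.get('https://sxadmin.github.io', timeout=5)
text = r.content.decode(guess_encoding(r), 'replace')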
Getting web page information
Use the chardet library for encoding detection.
To get a page's title and content plus the external links it contains, I wrote a class. It is used like this:
d = Get_Info(url='https://sxadmin.github.io')
d1 = d.get_urls()
# returns every external link found at the URL as a list; returns None if nothing was found (same below)
d2 = d.get_infos()
# returns the page's title and content as a dict
d3 = d.get_ips()
# returns the host's IP and open ports as a dict
The full code is as follows:
# coding:utf-8
import re
import requests
import time
import socket
from bs4 import BeautifulSoup as bs
import chardet
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

timeout = 3
socket.setdefaulttimeout(timeout)

from requests.packages import urllib3
urllib3.disable_warnings()

# common service ports to probe
ports = [
    21,
    22,
    23,
    25,
    53,
    69,
    139,
    445,
    389,
    1433,
    1521,
    2181,
    3306,
    3389,
    5432,
    5984,
    6379,
    7001,
    7002,
    8069,
    11211,
    27017,
    27018,
    50070,
    50030
]


class Get_Info:
    def __init__(self, url):
        self.url = url

    def get_ips(self):
        # resolve the hostname, then try a TCP connect on each port in the list
        url_port = []
        url_port.append(80)
        hostname = self.url.replace('http://', '').replace('https://', '').replace('/', '')
        url_ip = 'None'
        try:
            url_ip = socket.gethostbyname(str(hostname))
        except:
            pass
        if url_ip and url_ip != 'None':
            for port in ports:
                s = socket.socket()
                try:
                    s.connect((url_ip, port))
                    url_port.append(port)
                except Exception, e:
                    # print e
                    pass
                finally:
                    s.close()
        if url_ip and url_ip != 'None':
            infos = {}
            infos['ip'] = str(url_ip)
            infos['ports'] = str(url_port)
            return infos
        else:
            return None

    def get_infos(self):
        try:
            headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
            r = requests.get(url=self.url, headers=headers, verify=False, timeout=5)
            url_title, url_content, url_service = 'fetch failed', 'fetch failed', 'fetch failed'
            try:
                # detect the real encoding with chardet, then hand UTF-8 bytes to BeautifulSoup
                code = chardet.detect(r.content)['encoding']
                bp = bs(r.content.decode(code).encode('utf-8'), 'html.parser')
                url_title = bp.title.string
                url_content = bp.text
                url_service = r.headers
            except:
                # fall back to a plain regex when parsing fails
                url_title = re.search('<title>(.*?)</title>', r.content, re.I).group(1).decode(code).encode('utf-8')
                url_content = re.sub('([\.\?\*~!@#{$%\^&\*()-;"<>\[\]}_\+=]|[0-9]|[a-z]|[A-Z])', '', r.text)
                url_service = r.headers
            infos = {}
            infos['url'] = r.url
            infos['title'] = url_title
            url_contents = ''.join(r.text.split()).replace(' ', '')
            infos['content'] = re.sub('([\.\?\*~!@#{$%\^&\*()-;"<>\[\]}_\+=]|[0-9]|[a-z]|[A-Z])', '', url_contents).replace('|', '').replace("'", '')
            infos['service'] = url_service
            if infos:
                return infos
            else:
                return None
        except Exception, e:
            print e

    def get_urls(self):
        urlss = []
        headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        try:
            r = requests.get(url=self.url, headers=headers, verify=False, timeout=5)
            # match absolute http(s) URLs, then keep only the scheme + host part
            pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', re.I)
            urls = re.findall(pattern, r.content)
            for x in urls:
                a1, a2 = x.split('//')[0], x.split('//')[1].split('/')[0]
                a3 = ''.join(a1) + '//' + ''.join(a2)
                urlss.append(a3.replace("'", "").replace('>', '').replace('<', ''))
            if urlss:
                return list(set(urlss))
            else:
                return None
        except Exception, e:
            print e
            pass
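A quick way to try the class out (just a usage sketch repeating the calls shown earlier):

if __name__ == '__main__':
    d = Get_Info(url='https://sxadmin.github.io')
    print d.get_infos()   # dict with url / title / content / response headers
    print d.get_urls()    # deduplicated list of external links, or None
    print d.get_ips()     # dict with the host's IP and open ports, or None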