u1timate
Published on 2022-06-26 / 194 Visits
0

URL验证正则表达式

0x01 正则表达式

在网上找了很久,都没有找到满足需求的 URL 验证正则表达式。以下正则表达式是根据网上的资料总结编写的:

^http(?:s?):\/\/((?:(?:[a-zA-Z0-9\.\-\_]+(?:\.[a-zA-Z]{2,5})+)(?::(?:6553[0-5]|655[0-2]\d|65[0-4]\d{2}|6[0-4]\d{3}|[1-5]\d{4}|[0-9]\d{0,3}))?|(?:\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b))(?::(?:6553[0-5]|655[0-2]\d|65[0-4]\d{2}|6[0-4]\d{3}|[1-5]\d{4}|[0-9]\d{0,3}))?)(\/[a-zA-Z0-9\_\-\s\.\/\?\%\#\&\=]*)?$

该正则表达式可以校验 IP 地址、域名和端口是否合法;但如果 URL 中带有中文字符,则无法匹配,验证会失败。另外需要注意:域名分支内部和主机分组之后各有一个可选端口,因此类似 http://www.baidu.com:80:8080/ 这样带两个端口的域名 URL 也会被判定为合法。

0x02 使用

Golang

// pattern validates an http/https URL whose host is a domain name or an IPv4
// address, followed by an optional port (at most 65535) and an optional
// ASCII-only path. Capture group 1 is the host (including the port) and
// capture group 2 is the path.
pattern := `^http(?:s?):\/\/((?:(?:[a-zA-Z0-9\.\-\_]+(?:\.[a-zA-Z]{2,5})+)(?::(?:6553[0-5]|655[0-2]\d|65[0-4]\d{2}|6[0-4]\d{3}|[1-5]\d{4}|[0-9]\d{0,3}))?|(?:\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b))(?::(?:6553[0-5]|655[0-2]\d|65[0-4]\d{2}|6[0-4]\d{3}|[1-5]\d{4}|[0-9]\d{0,3}))?)(\/[a-zA-Z0-9\_\-\s\.\/\?\%\#\&\=]*)?$`
reg := regexp.MustCompile(pattern)

// Valid URL: the result holds one match made of the whole string, the host
// and the path.
s := "https://www.baidu.com:4480/zh_ch_aa/index.html"
res := reg.FindAllStringSubmatch(s, -1)
// Only res is passed: the format string has a single verb.
fmt.Printf(">>%+v\n", res)
//>>[[https://www.baidu.com:4480/zh_ch_aa/index.html www.baidu.com:4480 /zh_ch_aa/index.html]]

// Invalid IP (the last octet has four digits): no match, prints an empty slice.
s = "https://10.30.55.1111:4480/zh_ch_aa/index.html"
res = reg.FindAllStringSubmatch(s, -1)
fmt.Printf(">>%+v\n", res) //>>[]

// Invalid port (greater than 65535) on an otherwise valid IP: no match,
// prints an empty slice.
s = "https://10.30.55.111:65537/zh_ch_aa/index.html"
res = reg.FindAllStringSubmatch(s, -1)
fmt.Printf(">>%+v\n", res) //>>[]

// Chinese characters in the path are not covered by the pattern: no match,
// prints an empty slice.
s = "http://111.111.111.111:3333/zh_ch_aa/好.html"
res = reg.FindAllStringSubmatch(s, -1)
fmt.Printf(">>%+v\n", res) //>>[]

匹配失败时会返回一个空数组;匹配成功时会返回一个二维数组,其中每个元素按顺序包含:原字符串、主机(Host,含端口)和 URL 路径(Path,如果存在)。示例如下:
[[https://www.baidu.com:4480/zh_ch_aa/index.html www.baidu.com:4480 /zh_ch_aa/index.html]]

JavaScript

// Validates an http/https URL whose host is a domain name or an IPv4 address,
// followed by an optional port (at most 65535) and an optional ASCII-only path.
// Capture group 1 is the host (including the port), capture group 2 is the path.
// NOTE: a domain host followed by two ports (e.g. "www.baidu.com:80:8080") is
// also accepted, because the optional port group appears both inside the domain
// branch and after the host alternation.
const pattern = /^http(?:s?):\/\/((?:(?:[a-zA-Z0-9\.\-\_]+(?:\.[a-zA-Z]{2,5})+)(?::(?:6553[0-5]|655[0-2]\d|65[0-4]\d{2}|6[0-4]\d{3}|[1-5]\d{4}|[0-9]\d{0,3}))?|(?:\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b))(?::(?:6553[0-5]|655[0-2]\d|65[0-4]\d{2}|6[0-4]\d{3}|[1-5]\d{4}|[0-9]\d{0,3}))?)(\/[a-zA-Z0-9\_\-\s\.\/\?\%\#\&\=]*)?$/;
// `re` is a copy of the literal; `pattern` can also be used directly.
const re = new RegExp(pattern);
// exec() returns null when the URL does not match, otherwise the match array.
const result = re.exec("http://1.1.1.1:555/index/id=2");
console.log(result);
// Output: ['http://1.1.1.1:555/index/id=2', '1.1.1.1:555', '/index/id=2', index: 0, input: 'http://1.1.1.1:555/index/id=2', groups: undefined]