diff --git a/app/spider/common/common.go b/app/spider/common/common.go index 5918a1ee..d14372fd 100644 --- a/app/spider/common/common.go +++ b/app/spider/common/common.go @@ -18,22 +18,22 @@ import ( func CleanHtml(str string, depth int) string { if depth > 0 { //将HTML标签全转换成小写 - re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") + re, _ := regexp.Compile("<[\\S\\s]+?>") str = re.ReplaceAllStringFunc(str, strings.ToLower) } if depth > 1 { //去除STYLE - re, _ := regexp.Compile("\\<style[\\S\\s]+?\\</style\\>") + re, _ := regexp.Compile("<style[\\S\\s]+?</style>") str = re.ReplaceAllString(str, "") } if depth > 2 { //去除SCRIPT - re, _ := regexp.Compile("\\<script[\\S\\s]+?\\</script\\>") + re, _ := regexp.Compile("<script[\\S\\s]+?</script>") str = re.ReplaceAllString(str, "") } if depth > 3 { //去除所有尖括号内的HTML代码,并换成换行符 - re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") + re, _ := regexp.Compile("<[\\S\\s]+?>") str = re.ReplaceAllString(str, "\n") } if depth > 4 { @@ -232,7 +232,7 @@ func ProcessHtml(html string) string { //html = re.ReplaceAllString(html, "") //将HTML标签全转换成小写 - //re, _ = regexp.Compile("\\<[\\S\\s]+?\\>") + //re, _ = regexp.Compile("<[\\S\\s]+?>") //html = re.ReplaceAllStringFunc(html, strings.ToLower) //去除连续的换行符