利用正则表达式查找HTML源代码中的所有IMG的图片连接地址:
procedure GetImgLinks(Links: TStrings; var s: widestring);
var
buffer: TStrings;
i : Integer;
tmps, path, fs: string;
Comma : char;
URL: array[0..4095] of char;
Matchs, FRegExp : OleVariant;
begin
path := IncludeTrailingPathDelimiter(GetTempPathEx);
FRegExp := CreateOleObject('VBScript.RegExp');
FRegExp.Global := True;
FRegExp.IgnoreCase := True;
buffer := TStringList.Create;
try
// 利用正则表达式查找所有的 img 及其 src 链接
FRegExp.Pattern := '<img.*?\ssrc=([\""\''])([^\""\'']+?)\1.*?>';
//FRegExp.Pattern := CSImgUrl3;
Matchs := FRegExp.Execute(s);
if Matchs.Count > 0 then
begin
for i := 0 to Matchs.Count - 1 do
buffer.Add(FRegExp.Replace(Matchs.Item[i].Value, '$2'));
end;
for i := 0 to buffer.Count - 1 do
begin
tmps := ExtractFileName(UnixToDosPath(buffer[i]));
ShowHintMessage('Downloading [' + tmps + ']...');
if DownloadFile(buffer[i], path + tmps) then
begin
s := WideReplaceText(s, buffer[i], tmps);
Links.Add(path + tmps);
end;
end;
finally
buffer.Free;
end;
end;
--------------------------------------------
1.31修正
src=后面有空格不能正确匹配.已修正.
src=''为空时出错.已修正.
发现BUG: 图片路径有多个空格时只能保留一个.未修正.
2.18修正
图片路径有多个空格时只能保留一个的BUG.已修正.
<%
'功能:获取全部图片地址,保存到一个数组.
'来源: http://jorkin.reallydo.com/article.asp?id=448
'需要ReplaceAll函数: http://jorkin.reallydo.com/article.asp?id=406
Function getIMG(sString)
Dim sReallyDo, regEx, iReallyDo
Dim oMatches, cMatch
'//定义一个空数组
iReallyDo = -1
ReDim aReallyDo(iReallyDo)
If IsNull(sString) Then
getIMG = aReallyDo
Exit Function
End If
'//格式化HTML代码
'//将每个 <img 换行 方便正则替换
sReallyDo = sString
On Error Resume Next
sReallyDo = Replace(sReallyDo, vbCr, " ")
sReallyDo = Replace(sReallyDo, vbLf, " ")
sReallyDo = Replace(sReallyDo, vbTab, " ")
sReallyDo = Replace(sReallyDo, "<img ", vbCrLf & "<img ", 1, -1, 1)
sReallyDo = Replace(sReallyDo, "/>", " />", 1, -1, 1)
sReallyDo = ReplaceAll(sReallyDo, "= ", "=", True)
sReallyDo = ReplaceAll(sReallyDo, "> ", ">", True)
sReallyDo = Replace(sReallyDo, "><", ">" & vbCrLf & "<")
sReallyDo = Trim(sReallyDo)
Set regEx = New RegExp
regEx.IgnoreCase = True
regEx.Global = True
'//去除onclick,onload等脚本
regEx.Pattern = "\s[on].+?=([\""|\'])(.*?)\1"
sReallyDo = regEx.Replace(sReallyDo, "")
'//将SRC不带引号的图片地址加上引号
regEx.Pattern = "<img.*?\ssrc=([^\""\'\s][^\""\'\s>]*).*?>"
sReallyDo = regEx.Replace(sReallyDo, "<img src=""$1"" />")
'//正则匹配图片SRC地址
regEx.Pattern = "<img.*?\ssrc=([\""\'])([^\""\']+?)\1.*?>"
Set oMatches = regEx.Execute(sReallyDo)
'//将图片地址存入数组
For Each cMatch in oMatches
iReallyDo = iReallyDo + 1
ReDim Preserve aReallyDo(iReallyDo)
aReallyDo(iReallyDo) = regEx.Replace(cMatch.Value, "$2")
Next
getIMG = aReallyDo
End Function
%>
<%
'用法:很多人都问我怎么用,其实数组怎么用的这个就怎么用.
sContent = "HTML代码字段" '//sContent代表Html代码,原来写个ors怎么就不懂是记录集呢。。。
Dim aImages : aImages = GetImg(sContent) '//定义一个数组,并且用来存放所有分析到的图片地址
'列出所有图片的地址:
For i = 0 To UBound(aImages)
Response.Write("<b>第" & i + 1 & "张图片地址:</b> " & aImages(i) & "<br />")
Next
'列出第一张图片地址:
If UBound(aImages)> -1 Then Response.Write("<p><b>第一张图片地址:</b> " & aImages(0) & "</p>")
'列出最后一张图片地址:
If UBound(aImages)> -1 Then Response.Write("<p><b>最后一张图片地址:</b> " & aImages(UBound(aImages)) & "</p>")
%>
本文来源于 KinJAVA日志 ( http://jorkin.reallydo.com )