<?php
// Command-line tool: download a web page and print the absolute http(s)
// URLs found in the src attribute of its <img> tags, one per line.

// Exactly one argument (the page URL) is expected besides the script name.
if ($argc != 2) {
    echo "Usage: php script.php <url>\n";
    exit(1);
}
$url = $argv[1];

// Set a User-Agent that mimics Chrome.
$options = [
    'http' => [
        'header' => 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    ],
];
$context = stream_context_create($options);

// Fetch the page content. file_get_contents() does not throw on failure: it
// emits a warning and returns false, so the return value has to be checked
// explicitly (a try/catch around it never fires).
$html = file_get_contents($url, false, $context);
if ($html === false) {
    $lastError = error_get_last();
    $reason = isset($lastError['message']) ? $lastError['message'] : 'unknown error';
    echo "Error fetching the URL: {$reason}\n";
    exit(1);
}

// Regular expression capturing the src value of every <img> tag
// (case-insensitive, single- or double-quoted attribute values).
$pattern = '/<img[^>]*\bsrc=["\']([^"\']+)/i';

// Run the match; preg_match_all() returns false when the regex engine fails.
if (preg_match_all($pattern, $html, $matches) === false) {
    echo "Error extracting image links from the page\n";
    exit(1);
}

// Keep only links starting with http:// or https://. Base64 "data:image"
// URIs and relative paths are dropped by the same test, because they do not
// start with either scheme.
$imgLinks = array_filter($matches[1], function ($link) {
    return strpos($link, 'http://') === 0 || strpos($link, 'https://') === 0;
});

// Print all links, one per line.
foreach ($imgLinks as $link) {
    echo $link . "\n";
}
?>
import sys
import re
import requests
def get_img_links(url, timeout=10):
    """Fetch a web page and return the absolute http(s) image URLs it references.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it a stalled server would
            block the call indefinitely.

    Returns:
        A list with the ``src`` values of the page's ``<img>`` tags, in the
        order they appear, restricted to those starting with ``http://`` or
        ``https://``.

    Prints the error and exits the process with status 1 when the request
    fails or the server answers with an error status.
    """
    # Set a User-Agent that mimics Chrome.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        # Fetch the page content.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        html = response.text
    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        sys.exit(1)

    # Regular expression capturing the src value of every <img> tag.
    pattern = r'<img[^>]*\bsrc=["\']([^"\']+)'
    # HTML tag and attribute names are case-insensitive, so <IMG SRC=...>
    # must match too (the PHP version of this script uses the /i flag).
    matches = re.findall(pattern, html, re.IGNORECASE)

    # Keep only http(s) links. Base64 "data:image" URIs and relative paths
    # are dropped by the same test, since they start with neither scheme.
    return [link for link in matches if link.startswith(('http://', 'https://'))]
if __name__ == "__main__":
    # Exactly one argument (the page URL) is required besides the script name.
    if len(sys.argv) != 2:
        print("Usage: python script.py <url>")
        sys.exit(1)

    # Print every extracted link on its own line.
    for img_url in get_img_links(sys.argv[1]):
        print(img_url)