先上脚本
[root@mysql scripts]# cat html_to_table.sh
#!/bin/bash
# oldboy linux training
# 2015-06-01
# Happy Children's Day
# 说明:本脚本来自老男孩linux21期学员张耀开发!
# Scratch files shared by the functions below: the downloaded HTML, and the
# lesson links extracted from it (one per line).
EduFile=/tmp/edu.html
EduFile2=/tmp/edu2.html
# All command-line arguments joined into a single string; used as the course URL.
Url="$*"

# Require at least one argument. The script relies on bash-only syntax, so the
# usage text points at bash rather than /bin/sh.
if [ $# -eq 0 ]; then
  echo "USAGE: bash $0 http://...." >&2
  exit 1
fi

# Probe the URL with a header-only request before doing any real work.
# "$Url" is quoted so characters such as "?" or "*" are never glob-expanded.
if ! curl -I "$Url" &>/dev/null; then
  echo "Bad url,Please check it" >&2
  exit 1
fi
#######################################
# Download the course page and derive the number of lesson-list pages and the
# course id from it.
# Globals:
#   Url      (read)    course page URL
#   EduFile  (written) overwritten with the downloaded page
#   num      (written) highest page number found in the pager, or empty
#   pagenum  (written) num, defaulting to 1 when no pager link matched
#   CourseId (written) 4th field of Url when split on "-" and "."
# Arguments: none
#######################################
function getnum(){
  curl -s "$Url" >"$EduFile"

  # Prefer the link carrying class "pagesGoEnd"; otherwise fall back to the
  # links carrying class "pagesNum". Only digits are captured, and when several
  # lines match the largest number wins, so num is always one plain integer.
  if grep -q '"pagesGoEnd"' "$EduFile" 2>/dev/null; then
    num=$(sed -rn 's#.*page=([0-9]+)" class="pagesGoEnd".*$#\1#p' "$EduFile" \
      | sort -n | tail -n 1)
  else
    num=$(sed -rn 's|.*page=([0-9]+)#" class="pagesNum".*$|\1|p' "$EduFile" \
      | sort -n | tail -n 1)
  fi
  pagenum=${num:-1}

  # e.g. http://edu.51cto.com/course/course_id-839.html -> 839
  CourseId=$(printf '%s\n' "$Url" | awk -F '[-.]' '{print $4}')
}
#######################################
# Download every lesson-list page of the course and append it to $EduFile.
# Calls getnum first, which sets pagenum and CourseId.
# Globals:
#   EduFile  (appended to)
#   pagenum  (read) number of pages to fetch
#   CourseId (read) course id placed in the request URL
# Arguments: none
# Outputs:   nothing; curl's stderr is discarded
#######################################
function Curl(){
  local page pages

  getnum

  # Make sure the loop gets exactly one plain integer; anything else
  # (empty, multi-line, non-numeric) falls back to a single page.
  pages=$pagenum
  [[ $pages =~ ^[0-9]+$ ]] || pages=1

  for page in $(seq "$pages"); do
    # Best effort: a failed download of one page is silently skipped.
    curl "http://edu.51cto.com/index.php?do=course&m=lessions&course_id=$CourseId&page=$page" \
      2>/dev/null >>"$EduFile"
  done
}
#######################################
# Turn the lesson links found in $EduFile into numbered HTML table rows.
# Globals:
#   EduFile  (read)    downloaded HTML
#   EduFile2 (written) overwritten with the extracted links, one per line
#   sum      (written) all <tr>...</tr> rows concatenated, numbered from 1
# Arguments: none
#######################################
function table(){
  local index=1 line

  sum=""

  # Keep only lines mentioning do=lesson. On those, everything from the first
  # "<" through "</h4>" is replaced by the anchor tag, with the site root
  # prefixed to its href.
  sed -rn '/do=lesson/ s#<.*(<a href=")(.*)</h4>#\1http://edu.51cto.com\2#gp' \
    "$EduFile" >"$EduFile2"

  # read -r keeps backslashes literal; the default IFS still trims surrounding
  # whitespace from each line. Attribute quotes are escaped so they reach the
  # output, and every row is closed with </tr>.
  while read -r line; do
    sum+="<tr><th width=\"40\" scope=\"row\">$index</th><td width=\"520\">$line</td></tr>"
    ((index++))
  done <"$EduFile2"
}
#######################################
# Write a complete HTML document whose table body is the contents of $sum.
# Globals:
#   sum (read) table rows to embed
# Arguments:
#   $1 - output file (optional; defaults to /tmp/oldboy.html)
#######################################
function html(){
  local out=${1:-/tmp/oldboy.html}

  # The document now opens with a doctype and <html>, matching the closing
  # </html> tag at the end.
  cat >"$out" <<END
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>test</title>
</head>
<body>
<table width="560" border="1">
$sum
</table>
</body>
</html>
END
}
# Pipeline driver: run the three stages in order -- Curl (download pages),
# table (build the rows), html (write the document). A failing stage does not
# stop the later ones; the return status is that of the last stage.
main() {
  local stage
  for stage in Curl table html; do
    "$stage"
  done
}
# Script entry point: run the download -> table -> HTML pipeline.
main
测试网页:http://edu.51cto.com/course/course_id-839.html
1
2
[root@mysql scripts]# sh html_to_table.sh http://edu.51cto.com/course/course_id-839.html
[root@mysql scripts]# sz /tmp/oldboy.html
我将网页文件的table部分粘贴上来
2016-11-16
抓取网页的脚本
评论
发表评论
姓 名: