

























这几天发现一个很好的图片网站,美女特多!就打算下点图片,但是自己一张张下载的话,翻来覆去的太麻烦,所以找了个蜘蛛来帮忙。
随便在网上查了查,就下载了一个名叫WebSpider的蜘蛛程序。我仔细研究了一下,感觉作者也是写着玩的,意思意思,网页下载下来基本就丢了;另外结构上感觉不太满意,所以改了改。
我大致的想法是采用双线程:一个UI线程,一个工作线程。抓取方面采用深度优先搜索,基本思路:得到当前网页,提取并下载图片,然后用正则表达式匹配网址,再递归处理!在处理过程中,使用一个集合类来收集处理过的网址,防止死循环。代码大致如下:
1
/// <summary>
/// Fetches the page at <c>state.Uri</c>, downloads the images it references,
/// then recursively processes every not-yet-visited link that
/// <c>ValidPage</c> accepts (depth-first).
/// </summary>
/// <param name="state">
/// Page to process. Its status, content and success flags are updated in place.
/// </param>
/// <returns>
/// The final value of <c>state.ProcessSuccessfull</c>: true when the response
/// was handled without an exception, false otherwise.
/// </returns>
public bool Process( WebPageState state )
{
    state.ProcessStarted = true;
    state.ProcessSuccessfull = false;

    // NOTE(review): assumes the level field starts at 1 for a top-level call,
    // so the first page processed becomes the base URI - confirm at the field's declaration.
    if (level == 1)
    {
        m_baseUri = state.Uri;
    }

    try
    {
        Console.WriteLine( "Process Uri: {0}", state.Uri.AbsoluteUri );

        WebRequest req = WebRequest.Create( state.Uri );
        WebResponse res = null;

        try
        {
            res = req.GetResponse( );

            if ( res is HttpWebResponse )
            {
                state.StatusCode = ((HttpWebResponse)res).StatusCode.ToString( );
                state.StatusDescription = ((HttpWebResponse)res).StatusDescription;
            }
            if ( res is FileWebResponse )
            {
                state.StatusCode = "OK";
                state.StatusDescription = "OK";
            }

            if ( state.StatusCode.Equals( "OK" ) )
            {
                // Dispose the reader as soon as the body has been read.
                using ( StreamReader sr = new StreamReader( res.GetResponseStream( ) ) )
                {
                    state.Content = sr.ReadToEnd( );
                }

                DownloadPageImages( state );
                FollowPageLinks( state );
            }

            state.ProcessSuccessfull = true;
        }
        catch ( Exception ex )
        {
            HandleException( ex, state );
        }
        finally
        {
            if ( res != null )
            {
                res.Close( );
            }
        }
    }
    catch ( Exception ex )
    {
        Console.WriteLine( ex.ToString( ) );
    }

    Console.WriteLine( "Successfull: {0}", state.ProcessSuccessfull );

    return state.ProcessSuccessfull;
}

/// <summary>
/// Downloads every image matched by <c>RegularExpression.SrcExtractor</c> in
/// <c>state.Content</c> that has not been recorded in <c>m_pages</c> yet, and
/// reports progress through <c>ContentHandler</c> when one is attached.
/// </summary>
private void DownloadPageImages( WebPageState state )
{
    MatchCollection images = RegExUtil.GetMatchRegEx( RegularExpression.SrcExtractor, state.Content, true );

    for ( int k = 0; k < images.Count; k++ )
    {
        // NOTE(review): the de-duplication key comes from the "url" group while the
        // download uses group 1; presumably both capture the same text - confirm
        // against the SrcExtractor pattern.
        string address = images[k].Groups[1].ToString( );

        Uri uri;
        if ( !Uri.TryCreate( state.Uri, images[k].Groups["url"].ToString( ), out uri ) )
        {
            // A single malformed src attribute must not abort the whole page.
            continue;
        }

        if ( m_pages.Contains( uri.AbsoluteUri ) )
        {
            continue;
        }

        m_pages.Add( uri.AbsoluteUri );
        DownloadImage( state.Uri, address );

        if ( this.ContentHandler != null )
        {
            state.mes.MaxProgress = images.Count;
            state.mes.Progress = k + 1;
            state.mes.Result = state.Uri.AbsoluteUri;
            state.mes.Status = TaskStatus.Running;
            state.mes.Message = "当前共有图片下载数"+images.Count+" 现在正在下载第"+state.mes.Progress.ToString()+"图片" + address;
            ContentHandler.Invoke( state );
        }
    }
}

/// <summary>
/// Recursively processes each link matched by <c>RegularExpression.UrlExtractor</c>
/// in <c>state.Content</c> that passes <c>ValidPage</c> and is not yet in
/// <c>m_pages</c>. Links are not followed once <c>level</c> exceeds the limit.
/// </summary>
private void FollowPageLinks( WebPageState state )
{
    const int maxLevel = 10;

    // level is restored after every recursive call below, so it is constant
    // for the duration of this loop and can be checked once up front.
    if ( level > maxLevel )
    {
        return;
    }

    Match link = RegExUtil.GetMatchRegEx( RegularExpression.UrlExtractor, state.Content );

    while ( link.Success )
    {
        Uri uri;
        if ( Uri.TryCreate( state.Uri, link.Groups["url"].ToString( ), out uri )
            && ValidPage( uri )
            && !m_pages.Contains( uri.AbsoluteUri ) )
        {
            // Record the address before descending so cyclic links cannot loop forever.
            m_pages.Add( uri.AbsoluteUri );

            level++;
            try
            {
                Process( new WebPageState( uri ) );
            }
            finally
            {
                // Restore the depth so that level measures nesting, not the
                // total number of pages visited so far.
                level--;
            }
        }

        link = link.NextMatch( );
    }
}
111
#endregion
112
113
114
/// <summary>
/// Resolves <paramref name="imgUri"/> against <paramref name="m_bb"/> and, when the
/// address ends in a supported extension (jpg, jpeg or swf), saves the resource as
/// <c>temp\img&lt;id&gt;.&lt;ext&gt;</c> relative to the current directory.
/// </summary>
/// <param name="m_bb">Address of the page the image reference was found on.</param>
/// <param name="imgUri">Absolute or page-relative address of the image.</param>
/// <remarks>
/// Best effort: any failure is logged to the console and otherwise ignored, so a
/// single broken image never interrupts the crawl.
/// </remarks>
private void DownloadImage(Uri m_bb,string imgUri)
{
    Uri imageUri = null;

    try
    {
        imageUri = new Uri(m_bb, imgUri);

        // Invariant lower-casing keeps the comparison independent of the current culture.
        string ext = StrUtil.RightLastIndexOf(imageUri.AbsoluteUri, ".").ToLowerInvariant();

        // Exact comparison: the earlier "jpg|jpeg|swf".IndexOf(ext) test also accepted
        // the empty string and any fragment such as "j" or "pg".
        if (ext != "jpg" && ext != "jpeg" && ext != "swf")
        {
            return;
        }

        // DownloadFile does not create missing directories.
        Directory.CreateDirectory("temp");

        // A file id is consumed only for resources that are actually downloaded.
        string outFile = "temp\\img" + (m_fileId++) + "." + ext;

        using (WebClient web = new WebClient())
        {
            web.DownloadFile(imageUri.AbsoluteUri, outFile);
        }
    }
    catch (Exception ex)
    {
        // imageUri is still null when the address itself could not be parsed.
        Console.WriteLine(
            "could not download img: {0} ({1})",
            imageUri != null ? imageUri.AbsoluteUri : imgUri,
            ex.Message);
    }
}
现在基本可以下载图片了,不过感觉要优化的地方较多!递归的层级暂时没有控制,性能也一般,代码的结构还是比较乱,后续再重构吧!
此内容由惯性聚合(RSS阅读器)自动聚合整理,仅供阅读参考。 原文来自 — 版权归原作者所有。