C#简易采集工具
这段时间比较忙,一直在搞CentOS,所以很少碰编程。但是昨天修改以前写的这个采集工具的时候,不小心把GUI版本改坏了,我也不打算修复了,今天就索性把它的核心代码放出来,我相信有很多人需要它。
Program.cs 文件[入口文件]
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Text.RegularExpressions;
using System.Threading;
namespace Gather
{
/// <summary>
/// Console front end of the gather tool: shows the menu, loads the INI
/// configuration and dispatches to the <c>Gather</c> tasks.
/// </summary>
class Program
{
    // Number of rule/data slots read from each INI section.
    // Gather.page and Gather.test look at slots 0-4, Gather.log at slots 0-14.
    private const int PageSlots = 5;
    private const int LogSlots = 15;
    private const int TestSlots = 5;

    /// <summary>
    /// Runs the interactive menu loop until the user chooses option 0.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        Console.Title = "简易采集工具 - A Gather Tool";
        Core core = new Core();
        Ini ini = new Ini();
        bool menu = true;
        string inifile = null;
        string sign = null;
        string page = null;
        string log = null;
        string test = null;
        string suffix = null;
        string encoded = null;
        string page_url = null;
        string test_url = null;
        string start = null;
        string end = null;
        string[] page_rule = new string[PageSlots];
        string[] page_data = new string[PageSlots];
        string[] log_rule = new string[LogSlots];
        string[] log_data = new string[LogSlots];
        string[] test_rule = new string[TestSlots];
        string[] test_data = new string[TestSlots];

        // Main program loop.
        while (menu)
        {
            core.copy();
            // Options 2-4 are shown as [*] until a configuration file has been chosen.
            Console.Write(inifile == null
                ? " 请按编号输入要执行的操作:\n\n [1] 设置配置文件\n [*] 采集列表页\n [*] 采集文章页\n [*] 采集规则测试\n [0] 退出程序\n\n 编号:"
                : " 请按编号输入要执行的操作:\n\n [1] 设置配置文件\n [2] 采集列表页\n [3] 采集文章页\n [4] 采集规则测试\n [0] 退出程序\n\n 编号:");

            int doit;
            if (!int.TryParse(Console.ReadLine(), out doit))
            {
                Notify(core, " 无效的编号!");
                continue;
            }

            try
            {
                switch (doit)
                {
                    // Exit.
                    case 0:
                        menu = false;
                        break;

                    // Choose the configuration file and load every setting from it.
                    case 1:
                        inifile = PromptForIniFile(core);
                        ini.setfile(inifile);
                        sign = ini.readini("setting", "sign");
                        page = ini.readini("setting", "page");
                        log = ini.readini("setting", "log");
                        test = ini.readini("setting", "test");
                        suffix = ini.readini("setting", "suffix");
                        encoded = ini.readini("setting", "encoded");
                        page_url = ini.readini("page", "url");
                        test_url = ini.readini("test", "url");
                        start = ini.readini("page", "start");
                        end = ini.readini("page", "end");
                        page_rule = new string[PageSlots];
                        page_data = new string[PageSlots];
                        log_rule = new string[LogSlots];
                        log_data = new string[LogSlots];
                        test_rule = new string[TestSlots];
                        test_data = new string[TestSlots];
                        LoadRules(ini, "page", page_rule, page_data);
                        LoadRules(ini, "log", log_rule, log_data);
                        LoadRules(ini, "test", test_rule, test_data);
                        break;

                    // Gather the list pages.
                    case 2:
                        if (HasConfig(core, inifile))
                        {
                            RunTask(core, () => new Gather(page_url, sign).page(start, end, page_rule, page_data, page, suffix, encoded));
                        }
                        break;

                    // Gather the article pages listed in the "-last" file produced by option 2.
                    case 3:
                        if (HasConfig(core, inifile))
                        {
                            RunTask(core, () => new Gather(page_url, sign).log(log_rule, log_data, page + "-last" + suffix, log, suffix, encoded));
                        }
                        break;

                    // Try the rules of the [test] section against the test URL.
                    case 4:
                        if (HasConfig(core, inifile))
                        {
                            RunTask(core, () => new Gather(test_url).test(test_rule, test_data, test, suffix, encoded));
                        }
                        break;

                    // Unknown menu number.
                    default:
                        Notify(core, " 无效的编号!");
                        break;
                }
            }
            catch (Exception ex)
            {
                // Anything reaching this point is not an input error (for example the INI
                // API being unavailable), so report what actually happened.
                Notify(core, " 发生错误:" + ex.Message);
            }
        }

        core.copy();
        Console.WriteLine(" 再见!");
        Console.ReadKey();
    }

    /// <summary>
    /// Clears the screen, prints one message and waits for a key press.
    /// </summary>
    private static void Notify(Core core, string message)
    {
        core.copy();
        Console.WriteLine(message);
        Console.ReadKey();
    }

    /// <summary>
    /// Returns true when a configuration file has been chosen; otherwise tells the
    /// user to choose one first and returns false.
    /// </summary>
    private static bool HasConfig(Core core, string inifile)
    {
        if (inifile != null)
        {
            return true;
        }
        Notify(core, " 请先设置配置文件!");
        return false;
    }

    /// <summary>
    /// Keeps asking for a configuration file path until an existing file is named,
    /// then returns its absolute path.
    /// </summary>
    private static string PromptForIniFile(Core core)
    {
        while (true)
        {
            core.copy();
            Console.Write(" 请输入配置文件的路径,例如:d:\\catch\\fdawn.ini\n 若要使用同目录下的配置文件请直接填写名称,例如:fdawn.ini\n\n 路径:");
            string resolved = ResolveIniPath(Console.ReadLine());
            core.copy();
            if (resolved != null)
            {
                Console.WriteLine(" 当前设置的配置文件的路径为 {0}", resolved);
                Console.ReadKey();
                return resolved;
            }
            Console.WriteLine(" 配置文件不存在!");
            Console.ReadKey();
        }
    }

    /// <summary>
    /// Turns user input into an absolute path, resolving relative names against the
    /// program directory, and checks that this exact file exists.
    /// </summary>
    /// <returns>The absolute path, or null when the input is empty, malformed or names no file.</returns>
    private static string ResolveIniPath(string input)
    {
        if (string.IsNullOrEmpty(input))
        {
            return null;
        }
        try
        {
            // Resolve first, test second: the path that is validated must be the path
            // that is later handed to the INI reader.
            string full = Path.GetFullPath(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, input));
            return File.Exists(full) ? full : null;
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
        catch (PathTooLongException)
        {
            return null;
        }
    }

    /// <summary>
    /// Fills <paramref name="rules"/> and <paramref name="data"/> from the keys
    /// rule0..ruleN and data0..dataN of one INI section.
    /// </summary>
    private static void LoadRules(Ini ini, string section, string[] rules, string[] data)
    {
        for (int i = 0; i < rules.Length; i++)
        {
            rules[i] = ini.readini(section, "rule" + i);
            data[i] = ini.readini(section, "data" + i);
        }
    }

    /// <summary>
    /// Runs one gather task and reports any failure. On success nothing more is
    /// awaited here because the Gather methods already wait for a key press
    /// after printing their completion message.
    /// </summary>
    private static void RunTask(Core core, Action task)
    {
        core.copy();
        try
        {
            task();
        }
        catch (Exception)
        {
            core.copy();
            Console.WriteLine(" 采集规则出现错误或网络不稳定!");
            Console.ReadKey();
        }
    }
}
}
Ini.cs 文件[Ini类]
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Runtime.InteropServices;
using System.IO;
namespace Gather
{
/// <summary>
/// Thin wrapper around the kernel32 private-profile (INI file) functions.
/// </summary>
class Ini
{
    // Initial size, in characters, of the buffer handed to GetPrivateProfileString.
    private const int InitialBufferSize = 500;
    // Growth stops here so a damaged file cannot make the buffer grow without bound.
    private const int MaxBufferSize = 65536;

    /// <summary>Path of the INI file used by <see cref="readini"/> and <see cref="writeini"/>.</summary>
    public string filepath;

    // The native function returns a 32-bit BOOL, so it is marshalled as bool rather than long.
    [DllImport("kernel32")]
    [return: MarshalAs(UnmanagedType.Bool)]
    private static extern bool WritePrivateProfileString(string section, string key, string val, string filePath);

    [DllImport("kernel32")]
    private static extern int GetPrivateProfileString(string section, string key, string def, StringBuilder retval, int size, string filePath);

    /// <summary>Selects the INI file that later reads and writes operate on.</summary>
    /// <param name="filepath">Path of the INI file.</param>
    public void setfile(string filepath)
    {
        this.filepath = filepath;
    }

    /// <summary>Writes one value to the selected INI file. The native result is not reported.</summary>
    /// <param name="section">Section name.</param>
    /// <param name="key">Key name inside the section.</param>
    /// <param name="value">Value to store.</param>
    public void writeini(string section, string key, string value)
    {
        WritePrivateProfileString(section, key, value, filepath);
    }

    /// <summary>Reads one value from the selected INI file.</summary>
    /// <param name="section">Section name.</param>
    /// <param name="key">Key name inside the section.</param>
    /// <returns>The stored value, or an empty string when the key is missing.</returns>
    public string readini(string section, string key)
    {
        int size = InitialBufferSize;
        while (true)
        {
            StringBuilder buffer = new StringBuilder(size);
            int copied = GetPrivateProfileString(section, key, "", buffer, size, filepath);
            // A result of size - 1 means the value was cut off to fit the buffer;
            // retry with a larger one so long regex rules are read completely.
            if (copied < size - 1 || size >= MaxBufferSize)
            {
                return buffer.ToString();
            }
            size = Math.Min(size * 2, MaxBufferSize);
        }
    }
}
}
Gather.cs 文件[采集类]
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using System.Net;
using System.Collections;
namespace Gather
{
/// <summary>
/// Downloads pages and runs a chain of regular-expression rules over them,
/// caching every intermediate result in text files named
/// prefix[-page]-ruleIndex + suffix.
/// </summary>
class Gather:Core
{
    // How many times one download is attempted before the task is aborted.
    private const int MaxFetchAttempts = 5;
    // A download or rule-0 capture only counts when it is longer than this many characters.
    private const int MinContentLength = 5;
    // Marker used in the INI file for an unused rule/data slot.
    private const string Disabled = "NULL";

    private string page_url, test_url, sign;

    /// <summary>Creates a gatherer for list and article pages.</summary>
    /// <param name="page_url">List page URL containing the wildcard <paramref name="sign"/>.</param>
    /// <param name="sign">Wildcard text that is replaced by the page number.</param>
    public Gather(string page_url, string sign)
    {
        this.page_url = page_url;
        this.sign = sign;
    }

    /// <summary>Creates a gatherer for rule testing.</summary>
    /// <param name="test_url">URL the test rules are run against.</param>
    public Gather(string test_url)
    {
        this.test_url = test_url;
    }

    /// <summary>Creates <paramref name="filename"/> or truncates it to zero length.</summary>
    private void clearfile(string filename)
    {
        using (StreamWriter writer = new StreamWriter(filename))
        {
            writer.Write("");
        }
    }

    /// <summary>Appends <paramref name="str"/> to <paramref name="filename"/> (UTF-8).</summary>
    private void writefile(string str, string filename)
    {
        using (StreamWriter writer = File.AppendText(filename))
        {
            writer.Write(str);
        }
    }

    /// <summary>
    /// Reads all lines of a cache file. Cache files are always produced by
    /// <see cref="writefile"/>, which writes UTF-8, so they are read back as UTF-8
    /// regardless of the encoding of the downloaded pages.
    /// </summary>
    private string[] readfile(string filename)
    {
        return File.ReadAllLines(filename, Encoding.UTF8);
    }

    /// <summary>
    /// Downloads the list page for one page number and returns the matches of
    /// <paramref name="regular"/> in it.
    /// </summary>
    /// <param name="regular">Regular expression to apply.</param>
    /// <param name="url">URL containing the wildcard <paramref name="sign"/>.</param>
    /// <param name="encode">Name of the page encoding.</param>
    /// <param name="page">Text that replaces the wildcard.</param>
    /// <param name="sign">Wildcard text inside <paramref name="url"/>.</param>
    public MatchCollection initialise(string regular, string url, string encode, string page, string sign)
    {
        string realurl = url.Replace(sign, page);
        return replace(regular, link(realurl, encode));
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and returns the matches of
    /// <paramref name="regular"/> in it (variant used for rule testing).
    /// </summary>
    public MatchCollection initialise(string regular, string url, string encode)
    {
        return replace(regular, link(url, encode));
    }

    /// <summary>Returns every case-insensitive match of <paramref name="regular"/> in <paramref name="main"/>.</summary>
    private MatchCollection replace(string regular, string main)
    {
        return new Regex(regular, RegexOptions.IgnoreCase).Matches(main);
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and decodes it with the encoding named by
    /// <paramref name="encode"/>.
    /// </summary>
    /// <returns>The page text, or an empty string when the request or the transfer failed.</returns>
    private string link(string url, string encode)
    {
        try
        {
            WebRequest request = WebRequest.Create(url);
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(encode)))
            {
                return reader.ReadToEnd();
            }
        }
        catch (WebException)
        {
            // Network-level failure: callers treat the empty result as "try again".
            return string.Empty;
        }
        catch (IOException)
        {
            return string.Empty;
        }
    }

    /// <summary>
    /// Downloads <paramref name="url"/>, retrying a limited number of times until the
    /// body is longer than <see cref="MinContentLength"/> characters.
    /// </summary>
    /// <exception cref="InvalidOperationException">No usable body was received.</exception>
    private string fetch(string url, string encode)
    {
        for (int attempt = 0; attempt < MaxFetchAttempts; attempt++)
        {
            string body = link(url, encode);
            if (body.Length > MinContentLength)
            {
                return body;
            }
        }
        throw new InvalidOperationException("No usable content received from " + url);
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and concatenates all matches of
    /// <paramref name="regular"/>, downloading again (a limited number of times) while
    /// the concatenation is not longer than <see cref="MinContentLength"/> characters.
    /// </summary>
    /// <exception cref="InvalidOperationException">The rule never captured enough text.</exception>
    private string capture(string regular, string url, string encode)
    {
        for (int attempt = 0; attempt < MaxFetchAttempts; attempt++)
        {
            StringBuilder joined = new StringBuilder();
            foreach (Match hit in replace(regular, link(url, encode)))
            {
                joined.Append(hit.Value);
            }
            if (joined.Length > MinContentLength)
            {
                return joined.ToString();
            }
        }
        throw new InvalidOperationException("Rule 0 captured nothing usable from " + url);
    }

    /// <summary>Tells whether an INI slot holds a real value rather than nothing or "NULL".</summary>
    private static bool enabled(string value)
    {
        return !string.IsNullOrEmpty(value) && value != Disabled;
    }

    /// <summary>
    /// Runs rules 1..count-1 over the cache. Rule i reads the file written for rule
    /// data[i], and its matches are stored one per line in the file for rule i.
    /// </summary>
    /// <param name="basename">File name up to, but excluding, "-ruleIndex".</param>
    /// <returns>Index of the highest rule that was applied, or 0 when none was.</returns>
    private int applyrules(string[] rules, string[] data, int count, string basename, string suffix)
    {
        int highest = 0;
        int limit = Math.Min(count, Math.Min(rules.Length, data.Length));
        for (int i = 1; i < limit; i++)
        {
            if (!enabled(rules[i]) || !enabled(data[i]))
            {
                continue;
            }
            string target = basename + "-" + i + suffix;
            // The target is emptied before the source is read, as the tool has always done.
            clearfile(target);
            // Lines are joined without separators before the rule is applied.
            string source = string.Concat(readfile(basename + "-" + data[i] + suffix));
            StringBuilder output = new StringBuilder();
            foreach (Match hit in replace(rules[i], source))
            {
                output.Append(hit.Value).Append('\n');
            }
            writefile(output.ToString(), target);
            highest = i;
        }
        return highest;
    }

    /// <summary>
    /// Gathers the list pages <paramref name="start"/>..<paramref name="end"/>, applies
    /// the page rules to each and merges the output of the last applied rule into
    /// prefix + "-last" + suffix.
    /// </summary>
    /// <param name="start">First page number (decimal text).</param>
    /// <param name="end">Last page number, inclusive (decimal text).</param>
    /// <param name="page_rule">Rules 0-4; rule 0 selects the enclosing block.</param>
    /// <param name="page_data">For each rule, the index of the rule whose output it reads.</param>
    /// <param name="prefix">File name prefix of the cache files.</param>
    /// <param name="suffix">File name suffix of the cache files.</param>
    /// <param name="encode">Name of the page encoding.</param>
    /// <exception cref="InvalidOperationException">A page could not be gathered.</exception>
    public void page(string start, string end, string[] page_rule, string[] page_data, string prefix, string suffix, string encode)
    {
        Console.WriteLine(" 正在执行采集任务!请等待...");
        int first = int.Parse(start);
        int last = int.Parse(end);
        int lastrule = 0;

        for (int num = first; num <= last; num++)
        {
            string basename = prefix + "-" + num;
            string cache = basename + "-0" + suffix;
            // Fetch the page and cache the text captured by rule 0.
            clearfile(cache);
            writefile(capture(page_rule[0], page_url.Replace(sign, num.ToString()), encode), cache);
            // Run the remaining rules over the cache.
            lastrule = applyrules(page_rule, page_data, 5, basename, suffix);
        }

        // Merge the final rule's output of every page. The files are addressed by the
        // same page numbers they were written under, so a start other than 1 works.
        string merged = prefix + "-last" + suffix;
        clearfile(merged);
        for (int num = first; num <= last; num++)
        {
            StringBuilder lines = new StringBuilder();
            foreach (string line in readfile(prefix + "-" + num + "-" + lastrule + suffix))
            {
                lines.Append(line).Append('\n');
            }
            writefile(lines.ToString(), merged);
        }

        this.copy();
        Console.WriteLine(" 采集任务已执行完成!");
        Console.ReadKey();
    }

    /// <summary>
    /// Downloads every URL listed (one per line) in <paramref name="filename"/> and
    /// applies the article rules to each download.
    /// </summary>
    /// <param name="log_rule">Rules 0-14; slot 0 is not applied, the raw page is cached as rule 0.</param>
    /// <param name="log_data">For each rule, the index of the rule whose output it reads.</param>
    /// <param name="filename">File holding the article URLs.</param>
    /// <param name="prefix">File name prefix of the cache files.</param>
    /// <param name="suffix">File name suffix of the cache files.</param>
    /// <param name="encode">Name of the page encoding.</param>
    /// <exception cref="InvalidOperationException">An article could not be downloaded.</exception>
    public void log(string[] log_rule, string[] log_data, string filename, string prefix, string suffix, string encode)
    {
        Console.WriteLine(" 正在执行采集任务!请等待...");
        string[] log_url = readfile(filename);

        // Download every article into its rule-0 cache file.
        for (int i = 0; i < log_url.Length; i++)
        {
            string cache = prefix + "-" + i + "-0" + suffix;
            clearfile(cache);
            writefile(fetch(log_url[i], encode), cache);
        }

        // Run the rules over each cached article.
        for (int num = 0; num < log_url.Length; num++)
        {
            applyrules(log_rule, log_data, 15, prefix + "-" + num, suffix);
        }

        this.copy();
        Console.WriteLine(" 采集任务已执行完成!");
        Console.ReadKey();
    }

    /// <summary>
    /// Runs the test rules against the test URL so a rule set can be checked on a
    /// single page.
    /// </summary>
    /// <param name="test_rule">Rules 0-4; rule 0 selects the enclosing block.</param>
    /// <param name="test_data">For each rule, the index of the rule whose output it reads.</param>
    /// <param name="prefix">File name prefix of the cache files.</param>
    /// <param name="suffix">File name suffix of the cache files.</param>
    /// <param name="encode">Name of the page encoding.</param>
    /// <exception cref="InvalidOperationException">The page could not be gathered.</exception>
    public void test(string[] test_rule, string[] test_data, string prefix, string suffix, string encode)
    {
        this.copy();
        Console.WriteLine(" 正在执行采集任务!请等待...");

        // Fetch the page and cache the text captured by rule 0.
        string cache = prefix + "-0" + suffix;
        clearfile(cache);
        writefile(capture(test_rule[0], test_url, encode), cache);

        // Run the remaining rules over the cache.
        applyrules(test_rule, test_data, 5, prefix, suffix);

        this.copy();
        Console.WriteLine(" 采集任务已执行完成!");
        Console.ReadKey();
    }
}
}
Core.cs 文件[核心类]
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
namespace Gather
{
/// <summary>
/// Base class of the tool; holds functionality shared by the other classes.
/// </summary>
class Core
{
    /// <summary>
    /// Clears the console and prints the tool's banner.
    /// </summary>
    public void copy()
    {
        const string border = " --------------------------------------------------------------------------";
        string[] banner =
        {
            border,
            " Module: Gather Tool",
            " Author: _Gemini.Ki",
            " Warning: Ini file need ANSI encoded",
            border
        };

        Console.Clear();
        foreach (string line in banner)
        {
            Console.WriteLine(line);
        }
    }
}
}
[如何扩展核心功能]
如要扩展功能,可以在Core.cs文件里面添加。
fdawn.ini 文件[配置文件]
;--------------------------------------------------------------------------
; Module: Gather Tool
; Warning: This Ini file need ANSI encoded
;--------------------------------------------------------------------------
;全局设置 不使用的属性值请设置为“NULL”
[setting]
;通配符
sign = [{*}]
;列表页数据保存的文件名前缀
page = D:\Gather\page\page
;文章页数据保存的文件名前缀
log = D:\Gather\log\log
;测试页数据保存的文件名
test = D:\Gather\test
;数据保存的文件后缀
suffix = .txt
;编码
encoded = utf-8
;列表页设置 从0开始,最大规则数为5,超出无效!
[page]
;列表页URL
url = https://www.nowamagic.net/librarys/veda/channel/ProgrammingLanguage/[{*}]/
;列表页采集开始页数
start = 1
;列表页采集结束页数
end = 3
;规则 rule0为匹配大标签,大标签必须包含其他规则的内容,否则结果为空!
rule0 = <div id="left_content">([\d\D]*?)(?=<div id="sidebar">)
rule1 = (?<=<div class="post_content">)([\d\D]*?)(?=</div>)
rule2 = (?<=href=")([\d\D]*?)(?=")
rule3 = NULL
rule4 = NULL
; 数据源 例如data1 = 0就是rule1使用rule0采集到的数据继续匹配
data0 = 0
data1 = 0
data2 = 1
data3 = NULL
data4 = NULL
;文章页设置 从0开始,最大规则数为15,超出无效!
[log]
;规则 rule0为匹配大标签,大标签必须包含其他规则的内容,否则结果为空!
rule0 = (?<=<div class="fullbox_content">)([\d\D]*?)(?=<div class="fullbox_footer"></div>)
rule1 = (?<=rel="bookmark">)([\d\D]*?)(?=</a>)
rule2 = (?<=<div class="post_info_left">)([\d\D]*?)(?=</div>)
rule3 = (\d{2,4}(.*?)){3}(?= )
rule4 = (?<=<div class="post_content readmood" id="defend_\d+">)([\d\D]*?)(?=<div class="fav_area">)
rule5 = NULL
rule6 = NULL
rule7 = NULL
rule8 = NULL
rule9 = NULL
rule10 = NULL
rule11 = NULL
rule12 = NULL
rule13 = NULL
rule14 = NULL
;数据源
data0 = 0
data1 = 0
data2 = 0
data3 = 2
data4 = 0
data5 = NULL
data6 = NULL
data7 = NULL
data8 = NULL
data9 = NULL
data10 = NULL
data11 = NULL
data12 = NULL
data13 = NULL
data14 = NULL
;采集规则测试设置
[test]
;测试页URL
url = https://www.nowamagic.net/librarys/veda/channel/ProgrammingLanguage/1/
;规则 rule0为匹配大标签,大标签必须包含其他规则的内容,否则结果为空!
rule0 = <div id="left_content">([\d\D]*?)(?=<div id="sidebar">)
rule1 = (?<=<div class="post_content">)([\d\D]*?)(?=</div>)
rule2 = (?<=href=")([\d\D]*?)(?=")
rule3 = NULL
rule4 = NULL
; 数据源 例如data1 = 0就是rule1使用rule0采集到的数据继续匹配
data0 = 0
data1 = 0
data2 = 1
data3 = NULL
data4 = NULL
[一直停留在正在采集]
若采集时一直卡在“正在采集”,很可能是采集规则错误引起的无限循环匹配。卡住时请强制关闭程序,修改好规则后再执行!
[乱码解决方法]
把配置文件保存成ANSI编码,而且要在配置文件设置采集页面的编码!