Parsing tables, cells with Html agility in C#
- by Kaeso
I need to parse Html code. More specifically, parse each cell of every rows in all tables. Each row represent a single object and each cell represent different properties. I want to parse these to be able to write an XML file with every data inside (without the useless HTML code). This is the way I thought it out initially but I ran out of ideas:
HTML:
<tr>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF">
1
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="left">
<a href="/ice/player.htm?id=8471675">Sidney Crosby</a>
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="center">
PIT
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="center">
C
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
39
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
32
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
33
</td>
<td class="statBox sorted" style="border-width:0px 1px 1px 0px; background-color: #E0E0E0" align="right">
<font color="#000000">
65
</font>
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
20
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
29
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
10
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
1
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
3
</td>
<td class="statBox" style="border-width:0px 0px 1px 0px; background-color: #FFFFFF" align="right">
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
0
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
154
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
20.8
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
21:54
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
22.6
</td>
<td class="statBox" style="border-width:0px 0px 1px 0px; background-color: #FFFFFF" align="right">
55.7
</td>
</tr>
C#:
using HtmlAgilityPack;
using System.Data;
namespace Stats
{
class StatsParser
{
private string htmlCode;
private static string fileName = "[" + DateTime.Now.ToShortDateString() + " NHL Stats].xml";
public StatsParser(string htmlCode)
{
this.htmlCode = htmlCode;
this.ParseHtml();
}
public DataTable ParseHtml()
{
var result = new DataTable();
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(htmlCode);
HtmlNode row = doc.DocumentNode.SelectNodes("//tr");
foreach (var statBox in row.SelectNodes("//td[@class='statBox']"))
{
System.Windows.MessageBox.Show(statBox.InnerText);
}
}
}
}