HttpParsing for hypertext
- by Nani
I am in process of getting all hierarchical links from a given link and validating them;
This is the code I wrote. But I am not feeling it as efficient.
Reasons are:
1.For the non unique links which open same page, code is getting sub-links again and again
2.Is the code getting all links?
3.Is it making valid URLs from the sub-links it derived?
4.May be some other reasons about which I have no idea.
Please suggest me how to make this piece of code efficient .
Thank you.
class Program
{
public static ArrayList sublink = new ArrayList();
public static ArrayList subtitle = new ArrayList();
public static int ini = 0, len_o, len_n, counter = 0;
static void Main(string[] args)
{
// Address of URL
string URL = "http://www.techonthenet.com/";
sublink.Add(URL);
l:
len_o = sublink.Count; len_o);
Console.WriteLine("-------------Level:" + counter++);
for (int i = ini; i < len_o; i++) test(sublink[i].ToString());
len_n = sublink.Count;
if (len_o < len_n) { ini = len_o; goto l; }
Console.ReadKey();
}
//method to get the sub-links
public static void test(string URL)
{
try
{
// Get HTML data
WebClient client = new WebClient();
Stream data = client.OpenRead(URL);
StreamReader reader = new StreamReader(data);
string str = "", htmldata = "", temp;
int n1, n2;
str = reader.ReadLine();
while (str != null)
{
htmldata += str;
str = reader.ReadLine();
}
data.Close();
for (int i = 0; i < htmldata.Length - 5; i++)
{
if (htmldata.Substring(i, 5) == "href=")
{
n1 = htmldata.Substring(i + 6, htmldata.Length - (i + 6)).IndexOf("\"");
temp = htmldata.Substring(i + 6, n1);
if (temp.Length > 4 && temp.Substring(0, 4) != "http")
{
if(temp.Substring(0,1)!="/")
temp=URL.Substring(0,URL.IndexOf(".com/")+5)+temp;
else temp = URL.Substring(0, URL.IndexOf(".com/") + 5) + temp.Remove(0,1);
}
if (temp.Length < 4) temp = URL.Substring(0, URL.IndexOf(".com/") + 5) + temp;
sublink.Add(temp);
n2 = htmldata.Substring(i + n1 + 1, htmldata.Length - (i + n1 + 1)).IndexOf("<");
subtitle.Add(htmldata.Substring(i + 6 + n1 + 2, n2 - 7));
i += temp.Length + htmldata.Substring(i + 6 + n1 + 2, n2 - 7).Length;
}
}
for (int i = len_n; i < sublink.Count; i++) Console.WriteLine(i + "--> " + sublink[i]);
}
catch (WebException exp)
{
Console.WriteLine("URL Could not be Resolved" + URL);
Console.WriteLine(exp.Message, "Exception");
}
}
}