Currently I need to develop an application that can extract information from a few websites.
This is what I have done so far:
Imports System
Imports System.Text.RegularExpressions
Imports System.IO
Imports System.Net
Imports System.Web
Imports System.Data.SqlClient
Imports System.Threading
Imports System.Data.DataSet
Imports System.Data.OleDb
''' <summary>
''' Console scraper: downloads one web page, pulls restaurant details and
''' visitor feedback out of its HTML with regular expressions, and stores them
''' in the infoRestaurant and feedBackRestaurant tables of the knowledgeBase database.
''' </summary>
Module module1
    ' Connection string for the local SQL Server database that receives the scraped rows.
    Private Const ConnectionString As String = "Data Source = localhost; Initial Catalog = knowledgeBase; Integrated Security = True; Connection Timeout = 0;"

    ' Parameterized statements: scraped text is untrusted and must never be
    ' concatenated into the SQL text.
    Private Const InsertRestaurantSql As String = "insert into infoRestaurant (nameRestaurant, cuisine, streetAddress, locality, postalCode, countryName, addressFull, tel, attractionType) values (@nameRestaurant, @cuisine, @streetAddress, @locality, @postalCode, @countryName, @addressFull, @tel, @attractionType)"
    Private Const InsertFeedbackSql As String = "insert into feedBackRestaurant (feedBackView) values (@feedBackView)"

    ' Matches any HTML tag; used to strip markup from a matched fragment.
    Private ReadOnly TagPattern As New Regex("<[^>]*>")

    ''' <summary>
    ''' Entry point: prompts for a URL on the console and runs the extractor on it.
    ''' </summary>
    Sub Main()
        Console.Write("enter url: ")
        Dim url As String = Console.ReadLine()

        ' ReadLine returns Nothing at end of input; a blank URL cannot be requested either.
        If url Is Nothing OrElse url.Trim().Length = 0 Then
            Console.WriteLine("no url entered")
            Return
        End If

        extractor(url)
    End Sub

    ''' <summary>
    ''' Downloads the page at <paramref name="url"/>, extracts the restaurant fields
    ''' and feedback paragraphs, and inserts them into the database.
    ''' One infoRestaurant row is written per address block found on the page and
    ''' one feedBackRestaurant row per feedback match. Fields that are not found
    ''' are stored as "NA".
    ''' </summary>
    ''' <param name="url">Address of the page to scrape.</param>
    Public Sub extractor(ByVal url As String)
        ' Fetch first so the database connection is not held open during the download.
        Dim page As String = DownloadPage(url)

        ' Every single-valued field must be extracted BEFORE the insert loop below;
        ' otherwise late fields (tel, attraction type) would always be written as "NA".
        Dim restaurantName As String = ExtractLast(page, "<h2 class=""name hotel""[^>]*>[\s\S]+?</h2>")
        Dim cuisine As String = ExtractLast(page, "<li class=""cuisine""[^>]*>[^>]+</li>")
        Dim streetAddress As String = ExtractLast(page, "<span class=""street-address""[^>]*>[^>]+</span>")
        Dim locality As String = ExtractLast(page, "<span class=""locality""[^>]*>[\s\S]+?</span>")
        Dim postalCode As String = ExtractLast(page, "<span property=""v:postal-code""[^>]*>[\s\S]+?</span>")
        Dim countryName As String = ExtractLast(page, "<span class=""country-name""[^>]*>[\s\S]+?</span>")
        Dim tel As String = ExtractLast(page, "<span class=""tel""[^>]*>[\s\S]+?</span>")
        Dim attractionType As String = ExtractLast(page, "<div><b>Attraction type:[^>]*>[\s\S]+?</div>")

        Using conn As New SqlConnection(ConnectionString)
            conn.Open()

            ' One row per <address class="adr"> block on the page.
            For Each addressMatch As Match In Regex.Matches(page, "<address class=""adr""[^>]*>[\s\S]+?</address>")
                InsertRestaurant(conn, restaurantName, cuisine, streetAddress, locality, postalCode, countryName, CleanMatch(addressMatch), tel, attractionType)
            Next

            ' One row per feedback match.
            For Each feedbackMatch As Match In Regex.Matches(page, "(?=<p id).*(?<=</p>)")
                InsertFeedback(conn, CleanMatch(feedbackMatch))
            Next
        End Using
    End Sub

    ''' <summary>
    ''' Formats a value as a quoted SQL string literal: empty or Nothing becomes 'NA',
    ''' otherwise the value is trimmed and embedded single quotes are doubled.
    ''' Kept for existing callers; the extractor itself uses command parameters instead.
    ''' </summary>
    ''' <param name="strParam">Raw text to format.</param>
    ''' <returns>The text wrapped in single quotes.</returns>
    Public Function FormatSqlParam(ByVal strParam As String) As String
        Return "'" & NormalizeField(strParam).Replace("'", "''") & "'"
    End Function

    ' Returns the full text of the response for the given URL, disposing the
    ' response and reader even when reading fails.
    Private Function DownloadPage(ByVal url As String) As String
        Dim pageRequest As WebRequest = WebRequest.Create(url)
        Using pageResponse As WebResponse = pageRequest.GetResponse()
            Using reader As New StreamReader(pageResponse.GetResponseStream())
                Return reader.ReadToEnd()
            End Using
        End Using
    End Function

    ' Returns the tag-stripped, trimmed text of the LAST match of pattern in html,
    ' or an empty string when the pattern does not match at all.
    Private Function ExtractLast(ByVal html As String, ByVal pattern As String) As String
        Dim found As MatchCollection = Regex.Matches(html, pattern)
        If found.Count = 0 Then
            Return ""
        End If
        Return CleanMatch(found(found.Count - 1))
    End Function

    ' Replaces every HTML tag in the matched text with a space and trims the result.
    Private Function CleanMatch(ByVal m As Match) As String
        Return TagPattern.Replace(m.Value, " ").Trim()
    End Function

    ' Maps empty/Nothing to the "NA" placeholder; otherwise trims the value.
    Private Function NormalizeField(ByVal value As String) As String
        If String.IsNullOrEmpty(value) Then
            Return "NA"
        End If
        Return value.Trim()
    End Function

    ' Adds a Unicode text parameter (the parameterized equivalent of an N'...' literal).
    Private Sub AddTextParameter(ByVal cmd As SqlCommand, ByVal parameterName As String, ByVal value As String)
        cmd.Parameters.Add(parameterName, System.Data.SqlDbType.NVarChar).Value = NormalizeField(value)
    End Sub

    ' Inserts one row into infoRestaurant.
    Private Sub InsertRestaurant(ByVal conn As SqlConnection, ByVal restaurantName As String, ByVal cuisine As String, ByVal streetAddress As String, ByVal locality As String, ByVal postalCode As String, ByVal countryName As String, ByVal addressFull As String, ByVal tel As String, ByVal attractionType As String)
        Using cmd As New SqlCommand(InsertRestaurantSql, conn)
            AddTextParameter(cmd, "@nameRestaurant", restaurantName)
            AddTextParameter(cmd, "@cuisine", cuisine)
            AddTextParameter(cmd, "@streetAddress", streetAddress)
            AddTextParameter(cmd, "@locality", locality)
            AddTextParameter(cmd, "@postalCode", postalCode)
            AddTextParameter(cmd, "@countryName", countryName)
            AddTextParameter(cmd, "@addressFull", addressFull)
            AddTextParameter(cmd, "@tel", tel)
            AddTextParameter(cmd, "@attractionType", attractionType)
            cmd.ExecuteNonQuery()
        End Using
    End Sub

    ' Inserts one row into feedBackRestaurant.
    Private Sub InsertFeedback(ByVal conn As SqlConnection, ByVal feedback As String)
        Using cmd As New SqlCommand(InsertFeedbackSql, conn)
            AddTextParameter(cmd, "@feedBackView", feedback)
            cmd.ExecuteNonQuery()
        End Using
    End Sub
End Module
--- Problems ---
The problems I am facing are:
1. The database foreign key is not working here. Someone told me that some code needs to be added, but I don't know how.
2. The data is duplicated every time I run the application. I guess it requires an update-database function, but I have no idea how to write it.
3. I have to add multithreading as well. And lastly, how can I make my application flexible so that it keeps working even when the HTML code changes? Can anyone help me, please?
The website that I need to extract from is http://www.tripadvisor.com/Tourism-g293951-Malaysia-Vacations.html — I need the information about hotels, restaurants and attractions. Please, I need some help here.