You are on page 1 of 8

'
' CreateWordNetwork
'
' Scott A. Golder <scott@redlog.net>
' Social Media Research Foundation
' 21 September 2010
'
'
' UPDATES
' ---
' - 10/15/2010
'   - stopwords added by Vlad Barash <vdb5@cornell.edu>
' - 1/25/2011 (scott)
'   - URL networks
'   - replaced ints with longs
' - 2/10/2011 (scott)
'   - added multiple options for edge definitions (directed/undirected,
'     suppress singletons, adjacent/co-tweeted)
'   - bit.ly links in URL networks are no longer broken due to stripping of
'     capitalization
' - 2/12/2012 (scott)
'   - added attributes for keyword edges
'
' BUGS
' ---
' - hashtags often have punctuation at the end of them. this needs to be
'   stripped
' - row processing stops when an empty cell is hit. This is undesirable, but
'   need a better method of detecting the end of the worksheet

Option Explicit

' globals
' NOTE: in VBA "Dim a, b As T" types only b; a becomes a Variant. Each name
' therefore carries its own "As" clause.
' edgelist maps an edge key (built in add_edge) to its occurrence count.
Dim edgelist As Dictionary, letters As Dictionary
' stop words loaded from STOPWORDS_FILE by CreateWordNetwork
Dim stoparray() As String
' Location of the comma-separated stop word list. This is a machine-specific
' path; edit it to point at your own copy of the file.
Public Const STOPWORDS_FILE As String = "C:\Users\Marc A. Smith\Documents\My Dropbox\_NodeXL\common-english-words.txt"

' Given the name of a sheet, see if a sheet with that name already exists.
'
' All sheet types are checked (not only worksheets) and the comparison is
' case-insensitive, because Excel rejects a new sheet whose name differs from
' an existing sheet's name only by case, whatever the existing sheet's type.
'
' nm      - sheet name to look for
' returns - True if a sheet with that name exists, False otherwise
Function check_exists(nm As String) As Boolean
    Dim sh As Object

    check_exists = False
    For Each sh In Sheets
        If StrComp(sh.Name, nm, vbTextCompare) = 0 Then
            check_exists = True
            Exit Function
        End If
    Next sh
End Function

' Function for checking if a string is in an array (exact, case-sensitive
' match).
'
' StringToCheckFor - the value to look for
' ArrayToCheck     - the array to scan
' returns          - True if any element equals StringToCheckFor
'
' Any runtime error (for example an array that has never been dimensioned)
' is reported with a message box and the function returns False.
Function DoesStringExistInArray(StringToCheckFor As String, ArrayToCheck() As String) As Boolean
    On Error GoTo DoesStringExistInArrayError

    Dim LB As Long, UB As Long, LoopCnt As Long

    ' no need to set the function to False: a Boolean function defaults to it
    LB = LBound(ArrayToCheck)
    UB = UBound(ArrayToCheck)

    ' now loop through the array to see if the string exists...
    For LoopCnt = LB To UB
        If StringToCheckFor = ArrayToCheck(LoopCnt) Then
            DoesStringExistInArray = True
            Exit Function
        End If
    Next LoopCnt
    Exit Function

DoesStringExistInArrayError:
    MsgBox "FormModuleName.DoesStringExistInArray " & Err.Number & ":" & Err.Description
End Function

' Look up a worksheet by name without raising an error.
' Returns Nothing when no worksheet of that name exists.
Private Function find_worksheet(ByVal nm As String) As Worksheet
    On Error Resume Next
    Set find_worksheet = Worksheets(nm)
    On Error GoTo 0
End Function

' Ask the user for the first row to process and validate the answer.
' A cancelled, non-numeric or out-of-range answer is reported the same way
' GetColumnNumber reports bad input: a message box, then End.
Private Function ask_start_row() As Long
    ' worksheet row limit of Excel 2007 and later
    Const MAX_ROW As Double = 1048576#

    Dim answer As String
    answer = InputBox("Choose a start row (usually 3 for NodeXL workbooks)", "Choose a row", "3")

    If Not IsNumeric(answer) Then
        MsgBox "input is malformed"
        End
    End If
    If CDbl(answer) < 1# Or CDbl(answer) > MAX_ROW Then
        MsgBox "input is malformed"
        End
    End If

    ask_start_row = CLng(answer)
End Function

' Return the 1-based position of the first "http://" or "https://" in txt,
' or 0 when neither occurs.
Private Function find_url_start(ByVal txt As String) As Long
    Dim plain As Long, secure As Long

    plain = InStr(1, txt, "http://", vbTextCompare)
    secure = InStr(1, txt, "https://", vbTextCompare)

    If plain = 0 Then
        find_url_start = secure
    ElseIf secure = 0 Then
        find_url_start = plain
    ElseIf plain < secure Then
        find_url_start = plain
    Else
        find_url_start = secure
    End If
End Function

' Read the comma-separated stop word list from the given file.
'
' Every non-blank line of the file contributes words (previously only the
' last line was kept, so a trailing blank line discarded the whole list).
' Words are trimmed and lower-cased so they compare equal to the lower-cased
' tokens produced by clean_txt.
'
' If the file cannot be read, the user is warned and an empty list is
' returned so the macro can continue without stop words.
Private Function load_stopwords(ByVal path As String) As String()
    Dim fnum As Integer
    Dim is_open As Boolean
    Dim ln As String
    Dim joined As String
    Dim words() As String
    Dim n As Long

    On Error GoTo load_failed

    ' FreeFile avoids clashing with a file number that is already in use
    fnum = FreeFile
    Open path For Input As #fnum
    is_open = True

    joined = ""
    Do While Not EOF(fnum)
        Line Input #fnum, ln
        ln = Trim$(ln)
        If Len(ln) > 0 Then
            If Len(joined) > 0 Then joined = joined & ","
            joined = joined & ln
        End If
    Loop
    Close #fnum
    is_open = False

    words = Split(LCase$(joined), ",")
    For n = LBound(words) To UBound(words)
        words(n) = Trim$(words(n))
    Next n

    load_stopwords = words
    Exit Function

load_failed:
    If is_open Then Close #fnum
    MsgBox "Could not read the stopword file:" & Chr$(10) & path & Chr$(10) & _
           "(" & Err.Description & ")" & Chr$(10) & "Continuing without stopwords."
    ' Split of an empty string yields a dimensioned, zero-length array
    words = Split("", ",")
    load_stopwords = words
End Function

' For every non-empty row of a user-chosen text column, copy the first URL
' and the first #hashtag found in the text into two user-chosen columns.
'
' The URL or hashtag runs from its first character up to (not including) the
' next space, or to the end of the text if there is no further space.
' Processing stops at the first empty cell in the source column.
Sub ExtractURLsAndHashtagsFromTweets()
    Dim sheet_name As String
    sheet_name = InputBox("Please choose a sheet. For NodeXL workbooks, usually Edges.", "Choose a Worksheet", "Edges")

    Dim col As Long
    col = GetColumnNumber(InputBox("Please choose a source column (A-ZZ)." & Chr$(10) & "For tweets, it's column R.", "Choose a column", "R"))

    Dim urlcol As Long
    urlcol = GetColumnNumber(InputBox("Please choose a column to store URLs (A-ZZ).", "Choose a column", "AF"))

    Dim htcol As Long
    htcol = GetColumnNumber(InputBox("Please choose a column to store #hashtags (A-ZZ).", "Choose a column", "AJ"))

    Dim curr_row As Long
    curr_row = ask_start_row()

    ' these two were previously undeclared, which does not compile under
    ' Option Explicit
    Dim wksht As Worksheet
    Dim txt As String

    Set wksht = find_worksheet(sheet_name)
    If wksht Is Nothing Then
        MsgBox "Worksheet '" & sheet_name & "' was not found."
        Exit Sub
    End If

    Dim substr As String
    Dim httpidx As Long
    Dim url_end As Long
    Dim htidx As Long
    Dim sp As Long

    Do While Len(wksht.Cells(curr_row, col).Text) > 0
        txt = wksht.Cells(curr_row, col)

        ' first URL (http or https)
        httpidx = find_url_start(txt)
        If httpidx > 0 Then
            url_end = InStr(httpidx, txt, " ")
            If url_end > 0 Then
                substr = Mid(txt, httpidx, url_end - httpidx)
            Else
                substr = Mid(txt, httpidx)
            End If
            wksht.Cells(curr_row, urlcol) = substr
        End If

        ' first hashtag
        htidx = InStr(txt, "#")
        If htidx > 0 Then
            sp = InStr(htidx + 1, txt, " ")
            If sp > 0 Then
                substr = Mid(txt, htidx, sp - htidx)
            Else
                substr = Mid(txt, htidx)
            End If
            wksht.Cells(curr_row, htcol) = substr
        End If

        curr_row = curr_row + 1
    Loop

    MsgBox "done"
End Sub

' Convert a one- or two-letter column name (A-ZZ, any case) to its 1-based
' column number: A = 1, Z = 26, AA = 27, ZZ = 702.
'
' Surrounding spaces are ignored. Anything else (empty input, more than two
' characters, characters other than A-Z) shows "input is malformed" and
' terminates the macro with End.
'
' The argument is no longer modified, and no Dictionary is needed for the
' letter lookup.
Function GetColumnNumber(col As String) As Long
    Dim name_uc As String
    Dim pos As Long
    Dim digit As Long
    Dim cidx As Long

    name_uc = UCase$(Trim$(col))

    If Len(name_uc) < 1 Or Len(name_uc) > 2 Then
        MsgBox "input is malformed"
        End
    End If

    ' treat the name as a base-26 number whose digits are A=1 .. Z=26
    cidx = 0
    For pos = 1 To Len(name_uc)
        digit = Asc(Mid$(name_uc, pos, 1)) - Asc("A") + 1
        If digit < 1 Or digit > 26 Then
            MsgBox "input is malformed"
            End
        End If
        cidx = (cidx * 26) + digit
    Next pos

    GetColumnNumber = cidx
End Function

' the main macro sub
'
' Builds a word co-occurrence network from a column of text and writes it to
' a new worksheet ("Word Network", "Word Network 1", ...) with columns
' V1, V2, WEIGHT and, when an attribute column was chosen, ATTRIBUTE.
'
' requires Microsoft Scripting Runtime and Microsoft VBScript Regular
' Expressions (Tools -> References ...)
Sub CreateWordNetwork()
    Dim option_adj As Integer
    Dim option_direct As Integer
    Dim option_suppress_singles As Integer

    ' the edgelist will hold all the edges and their counts
    Set edgelist = New Dictionary

    '
    ' name the new worksheet. e.g. if "Word Network" exists then create
    ' "Word Network 1" or "Word Network 2"
    '
    Dim worksheet_basename As String
    Dim worksheet_name As String
    worksheet_basename = "Word Network"
    worksheet_name = "Word Network"
    Dim wsctr As Long
    wsctr = 0
    Do While check_exists(worksheet_name)
        wsctr = wsctr + 1
        worksheet_name = worksheet_basename & " " & wsctr
    Loop

    '
    ' get list of stopwords from file, store in memory
    '
    stoparray = load_stopwords(STOPWORDS_FILE)

    '
    ' figure out which column the user wants to process
    '
    Dim sheet_name As String
    sheet_name = InputBox("Please choose a sheet. For NodeXL workbooks, usually Edges.", "Choose a Worksheet", "Edges")

    Dim col As String
    col = InputBox("Please choose a column (A-ZZ)." & Chr$(10) & "For tweets, it's column R.", "Choose a column", "R")
    col = UCase(col)

    Dim curr_row As Long
    curr_row = ask_start_row()

    Dim cidx As Long
    cidx = GetColumnNumber(col)

    ' attrib_col stays -1 when the user leaves the attribute column blank
    Dim attrib_col As Long
    attrib_col = -1
    Dim attrib_col_str As String
    attrib_col_str = InputBox("Enter the column for edge attributes (blank if none).", "Define edge attribute", "Q")
    If Len(attrib_col_str) > 0 Then
        attrib_col = GetColumnNumber(attrib_col_str)
    End If

    option_suppress_singles = MsgBox("Would you like to skip edges that occur only once?", vbYesNo, "Suppress singletons?")
    option_adj = MsgBox("Would you like edges to consist of" & Chr$(10) & "adjacent words or co-tweeted words?", vbYesNo, "Adjacent words?")
    If option_adj = vbYes Then
        option_direct = MsgBox("Would you like directed or undirected edges?", vbYesNo, "Directed edges?")
    Else
        option_direct = vbNo
        MsgBox ("Note: For co-tweeted word networks all edges are undirected.")
    End If

    ' debugging aid:
    ' MsgBox ( _
    '     "vbYes = " & vbYes & Chr$(10) & "vbNo = " & vbNo & Chr$(10) _
    '     & "suppress = " & option_suppress_singles & Chr$(10) _
    '     & "adjacent = " & option_adj & Chr$(10) _
    '     & "directed = " & option_direct & Chr$(10) _
    ' )

    '
    ' iterate down the column. Keep going til you run out of rows
    '
    Dim wksht As Worksheet
    Set wksht = find_worksheet(sheet_name)
    If wksht Is Nothing Then
        MsgBox "Worksheet '" & sheet_name & "' was not found."
        Exit Sub
    End If

    Dim txt As String
    Dim attrib_string As String
    Dim arr() As String
    ' one "As" clause per name: "Dim i, j As Long" would leave i a Variant
    Dim i As Long, j As Long
    Dim s1 As String, s2 As String, tmp As String

    Do While Len(wksht.Cells(curr_row, cidx).Text) > 0
        txt = wksht.Cells(curr_row, cidx)

        ' the attribute is appended to the edge key behind a Chr$(8)
        ' separator, so it comes back as pair(2) when the key is split below
        attrib_string = ""
        If attrib_col > 0 Then
            attrib_string = "" & Chr$(8) & wksht.Cells(curr_row, attrib_col).Text
        End If

        txt = clean_txt(txt)
        arr = Split(txt, " ")

        If option_adj = vbYes Then
            ' adjacent tokens only
            For i = LBound(arr) To UBound(arr) - 1
                s1 = arr(i)
                s2 = arr(i + 1)
                If option_direct = vbNo Then
                    ' alphabetize so that a-b and b-a are the same edge
                    If s1 > s2 Then
                        tmp = s1
                        s1 = s2
                        s2 = tmp
                    End If
                End If
                add_edge s1, s2, attrib_string
            Next i
        Else
            ' all pairs
            For i = LBound(arr) To UBound(arr) - 1
                For j = i + 1 To UBound(arr)
                    s1 = arr(i)
                    s2 = arr(j)
                    ' always alphabetize since there isn't an ordering and
                    ' so all edges are undirected
                    If s1 > s2 Then
                        tmp = s1
                        s1 = s2
                        s2 = tmp
                    End If
                    add_edge s1, s2, attrib_string
                Next j
            Next i
        End If

        curr_row = curr_row + 1
    Loop

    '
    ' create the new worksheet
    '
    Dim new_worksheet As Worksheet
    Set new_worksheet = Sheets.Add
    new_worksheet.Name = worksheet_name
    new_worksheet.Cells(1, 1) = "V1"
    new_worksheet.Cells(1, 2) = "V2"
    new_worksheet.Cells(1, 3) = "WEIGHT"
    If attrib_col > 0 Then
        new_worksheet.Cells(1, 4) = "ATTRIBUTE"
    End If

    '
    ' add each new edge to the new edgelist worksheet
    '
    Dim ct As Long
    ct = edgelist.Count
    MsgBox "There are " & ct & " new edges. Large numbers of edges may take a lot of time to add to the workbook."

    Dim key As Variant
    Dim pair() As String
    curr_row = 2
    For Each key In edgelist
        ' key layout: word1 Chr$(8) word2 [Chr$(8) attribute]
        pair = Split(key, Chr$(8))
        If option_suppress_singles = vbNo Or edgelist(key) > 1 Then
            new_worksheet.Cells(curr_row, 1) = pair(0)
            new_worksheet.Cells(curr_row, 2) = pair(1)
            new_worksheet.Cells(curr_row, 3) = "" & edgelist(key)
            If attrib_col > 0 Then
                new_worksheet.Cells(curr_row, 4) = pair(2)
            End If
            curr_row = curr_row + 1
        End If
    Next
End Sub

' Normalize a piece of text for tokenizing on spaces.
'
' The text is lower-cased, http:// and https:// URLs are removed, and every
' character other than a-z, @, # and _ (this includes digits) is replaced by
' a space. The result always ends with a space and may contain runs of
' spaces.
'
' requires Microsoft VBScript Regular Expressions library
Function clean_txt(ByVal s As String) As String
    Dim txt As String

    ' the trailing space lets the URL pattern match a URL at the very end
    txt = LCase(s) & " "

    ' remove URLs
    Dim url_re As New RegExp
    url_re.Global = True
    url_re.IgnoreCase = True
    url_re.Pattern = "https?://.*?\s"
    txt = url_re.Replace(txt, " ")

    ' blank out everything that is not a lower-case letter, @, # or _
    Dim re As New RegExp
    re.Global = True
    re.IgnoreCase = False
    re.Pattern = "[^a-z@#_]"
    txt = re.Replace(txt, " ")

    clean_txt = txt
End Function

' Record one occurrence of the edge s1 - s2 in the global edgelist.
'
' s1, s2 - the two words
' attrib - text appended to the edge key; CreateWordNetwork passes either an
'          empty string or Chr$(8) followed by the attribute
'
' Nothing is recorded when the two words are the same (no self loops), when
' either word is shorter than three characters, or when either word is a
' stop word.
Sub add_edge(ByVal s1 As String, ByVal s2 As String, ByVal attrib)
    Dim k As String

    ' Cheap tests first. VBA's Or does not short-circuit, so combining these
    ' in one expression would run both stop word scans on every call.
    If s1 = s2 Then Exit Sub
    If Len(s1) < 3 Or Len(s2) < 3 Then Exit Sub
    If DoesStringExistInArray(s1, stoparray) Then Exit Sub
    If DoesStringExistInArray(s2, stoparray) Then Exit Sub

    ' add the edge to the edgelist, or increment its count
    k = s1 & Chr$(8) & s2 & attrib
    If edgelist.Exists(k) Then
        ' counts are kept as Long (1&) so the addition cannot overflow an
        ' Integer
        edgelist.Item(k) = CLng(edgelist.Item(k)) + 1&
    Else
        edgelist.Add k, 1&
    End If
End Sub

'
' Done.
'

You might also like