[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

VMs: VBScript for finding repeating strings



I've been playing around with this VBscript. It looks for repeated strings
in a document that are longer than a set minimum. I have run it on a text
that I know well. It looks as if repeating strings are quite significant for
the story - at least I recognize their relevance. I have not tried it on the
VMS yet, I want to get a feel for what it does first.

Feel free to play around with it and I'm curious about your comments. It
should run on any WIndows system "as is". Just click the file.

NB: I fully expect that I've been reinventing the wheel :-)

NB: It is - as yet - horrendously inefficient. It uses N^2 operations for
analysis of a text of N characters. But my time is more valuable than the
computer's - isn't it? And the computer works while I sleep :-)

'---------------------------------------------------------------------------
------------
'
' Name:   repeat_strings.vbs
' Version:  0.1
' Date:   17 may 2004
' Author:   Petr Kazil
' Description: Searches for repeated strings in a text file
'
' Contact:  pklist01@xxxxxxxxx
'
'---------------------------------------------------------------------------
------------

Option Explicit

Const ForReading = 1

Dim inputFile : inputFile  = ""
Dim outputFile  : outputFile = "repeated_strings.txt"
Dim lineNumber  : lineNumber = 0

Dim objFSO  : objFSO  = Null
Dim objFile  : objFile  = Null
Dim line   : line   = ""

Dim longString()

Dim WshShell : WshShell = Null

Dim Junk, objFileOut

Dim length, length1, length2

Dim i,j, distance, match, matchString, minLength, matchPlace1, matchPlace2

'---------------------------------------------------------------------------
------------
' Open input file
'---------------------------------------------------------------------------
------------

inputFile = Inputbox( _
   "Search for repeat_strings" & _
                      chr(10) & "pklist01@xxxxxxxxx" & _
                      chr(10) & chr(10) & _
   "Input file:", "D:\Home\Input_file.txt")

If inputFile = "" Then
 Wscript.Quit
End If

Set objFSO = CreateObject("Scripting.FileSystemObject")

If objFSO.FileExists(inputFile) Then
 Set objFile = objFSO.OpenTextFile(inputFile, ForReading)
Else
 Wscript.Echo "Error : No input file : " & inputFile
 Wscript.Quit
End If

'---------------------------------------------------------------------------
------------
' Create output file
'---------------------------------------------------------------------------
------------

If objFSO.FileExists(outputFile) Then objFSO.DeleteFile outputFile , True
Set objFileOut = objFSO.CreateTextFile(outputFile, True, False)

'---------------------------------------------------------------------------
------------
' Read the file and put it in one long array
' Replace end-of-lines with spaces
'---------------------------------------------------------------------------
------------

lineNumber = 0
length1 = 0

Do While objFile.AtEndOfStream = False
 line = objFile.ReadLine
 line = line & " " 'Replace end of line by space
 lineNumber = lineNumber + 1
 length2 = Len(line)
 ReDim Preserve longString (length1 + length2)
 for i = 0 to length2 - 1
  longstring (length1 + i) = Mid(line, i+1, 1)  ' Strings run from 1 to N
 next
 length1 = Ubound(longString)+1
Loop

length = Ubound(longString) + 1

Set WshShell = CreateObject("WScript.Shell")

Junk = Msgbox ("Input file: " & vbTab & inputFile & vbNewLine & _
         "Lines read: " & vbTab & lineNumber & vbNewLine & _
         "length    : " & vbTab & length )

'---------------------------------------------------------------------------
------------
' Compare characters a certain distance apart
' Start at difference 1 and continue until difference length-1
'---------------------------------------------------------------------------
------------

match = 0
minLength = 12
matchString = ""

For distance = 1 To Ubound(longString)-1
 For j = 0 To Ubound(longString) - distance
  If longString(j) = longString(j+distance) Then
   If match = 1 Then
    matchString = matchString & longString(j)
   Else
    matchString = longString(j)
    match = 1
    matchPlace1 = j
    matchPlace2 = j+distance
   End If
  Else
   match = 0
   If Len(matchString) > minLength Then
    objFileOut.Writeline matchPlace1 & vbTab & matchPlace2 & vbTab & _
    Len(matchString) & vbTab & "|" & matchString & "|"
    matchString = ""
   End If
  End If
 next
next

'---------------------------------------------------------------------------
------------
' End program
'---------------------------------------------------------------------------
------------

______________________________________________________________________
To unsubscribe, send mail to majordomo@xxxxxxxxxxx with a body saying:
unsubscribe vms-list