#!/usr/local/bin/perl ########################################################################### ## Copyright (c) 2002 Roger D. Peng ## ## All rights reserved. ## ## Permission is hereby granted, free of charge, to any person ## obtaining a copy of this software and associated documentation ## files (the "Software"), to deal in the Software without ## restriction, including without limitation the rights to use, copy, ## modify, merge, publish, distribute, and/or sell copies of the ## Software, and to permit persons to whom the Software is furnished ## to do so, provided that the above copyright notice(s) and this ## permission notice appear in all copies of the Software and that ## both the above copyright notice(s) and this permission notice ## appear in supporting documentation. ## ## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ## EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ## MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ## NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE ## COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ## ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY ## DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, ## WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ## ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE ## OF THIS SOFTWARE. ## ## Except as contained in this notice, the name of a copyright holder ## shall not be used in advertising or otherwise to promote the sale, ## use or other dealings in this Software without prior written ## authorization of the copyright holder. 
##
###########################################################################

# blockcount: count the number of occurrences of each keyword per block
# of words.
#
# Usage: blockcount -k KEYWORDFILE [-b BLOCKSIZE] FILE...
#
#   -k KEYWORDFILE   file listing the keywords, one per line (required)
#   -b BLOCKSIZE     number of counted words per block (default 1700)
#
# For every input FILE a report is written to "FILE.bct".  Its first line
# lists the keywords, sorted and separated by single spaces.  Each line
# after that holds the keyword counts for one complete block, in the same
# column order as the header, each count followed by a single space.
#
# Counting rules:
#   * a whitespace-separated token containing a digit is skipped and does
#     not count towards the block size;
#   * every other token is lower-cased (A-Z only), stripped of leading and
#     trailing non-word characters, and counts as one word of the block,
#     even when nothing is left after stripping;
#   * words left over at the end of a file that do not fill a complete
#     block are not written.
#
# The name of each input file is printed to standard output once its
# report has been written.

use strict;
use warnings;
use Getopt::Std;

my $usage_string = "USAGE: blockcount -k -b [blocksize] [files]";

# Parse the command line; refuse to run without a keyword file or without
# any input file.
my %opts;
getopts('k:b:', \%opts) or die "$usage_string\n";
die "$usage_string\n" unless defined $opts{k} && @ARGV;

my $kwordfile = $opts{k};

# A missing (or zero) -b keeps the historical default block size.
my $bsize = $opts{b} ? $opts{b} : 1700;
die "Block size must be a positive integer\n$usage_string\n"
    unless $bsize =~ /\A\d+\z/ && $bsize > 0;

# Read the keywords.  %kcount serves both as the keyword set and as the
# per-block counters.
my %kcount;
open(my $kword_fh, '<', $kwordfile)
    or die "Can't open $kwordfile: $!\n";
while (my $kword = <$kword_fh>) {
    chomp($kword);
    $kcount{$kword} = 0;
}
close($kword_fh);

# One fixed, sorted keyword order is used for the header and for every
# data row, so each column is labelled by the keyword it actually counts.
my @kwords   = sort keys %kcount;
my $kwordstr = join(" ", @kwords);

# Iterate with "for" rather than testing the shifted value for truth, so
# that a file literally named "0" is still processed.
for my $docfile (@ARGV) {
    my $outfile = $docfile . ".bct";

    # Three-argument open: file names are never interpreted as modes/pipes.
    open(my $doc_fh, '<', $docfile)
        or die "Can't open $docfile: $!\n";
    open(my $out_fh, '>', $outfile)
        or die "Can't open $outfile: $!\n";

    print {$out_fh} "$kwordstr\n";

    # Start every document with an empty block.
    my $bcount = 0;
    @kcount{@kwords} = (0) x @kwords;

    while (my $docline = <$doc_fh>) {

        # split ' ' skips leading whitespace and splits on whitespace runs
        # (the trailing newline included).  Iterating over the list keeps
        # going past a literal "0" token instead of dropping the rest of
        # the line.
        for my $word (split(' ', $docline)) {

            # Tokens containing a digit are ignored entirely.
            next if $word =~ /\d/;

            $word =~ tr/A-Z/a-z/;
            $word =~ s/^\W+//;
            $word =~ s/\W+$//;

            $bcount++;
            $kcount{$word}++ if exists $kcount{$word};

            # Block complete: emit one row of counts and start afresh.
            if ($bcount == $bsize) {
                print {$out_fh} map({ "$_ " } @kcount{@kwords}), "\n";
                @kcount{@kwords} = (0) x @kwords;
                $bcount = 0;
            }
        }
    }

    close($doc_fh);

    # Buffered write errors only surface when the handle is closed.
    close($out_fh) or die "Can't close $outfile: $!\n";

    print "$docfile\n";
}