# Builds an inverted index over every file matched by ../FBIS/*.
#
# Outputs (all DB_File B-trees):
#   <aa|ab|...>.dbm       term      => "|<docid>-<count>|<docid>-<count>..."
#   docno_filename.dbm    <docid>   => path of the file the document came from
#   docno_reldocno.dbm    <docid>   => second whitespace field of the DOCNO line
#
# <docid> is an internal string id that starts at "aaaa" and advances with
# Perl's magic string increment.  The term database is rotated to a fresh file
# (aa.dbm, ab.dbm, ...) after every 30 input files.  Per-file timings are
# printed and appended to time2.log.

use strict;
use warnings;

use Getopt::Long;
use English;
use DB_File;
use Fcntl;
use Lingua::Stem qw(stem);
use Time::HiRes qw(gettimeofday);

# ---------------------------------------------------------------------------
# Stopword list: one word per line in stopwords.txt.
# ---------------------------------------------------------------------------
my $STOPWORD_FILE = 'stopwords.txt';
my %is_stopword;

open my $stop_fh, '<', $STOPWORD_FILE
    or die "Cannot open $STOPWORD_FILE: $!\n";
while ( my $stopword = <$stop_fh> ) {
    chomp $stopword;
    next if $stopword eq '';
    $is_stopword{$stopword} = 1;
}
close $stop_fh;

# clear_stopword($word)
#   Returns the empty string when $word is in the stopword list, otherwise
#   returns $word unchanged.
sub clear_stopword {
    my ($word) = @_;
    return exists $is_stopword{$word} ? '' : $word;
}

# ---------------------------------------------------------------------------
# Databases.
# ---------------------------------------------------------------------------
# All four hashes must be declared inside the parentheses: "my %a, %b" would
# only make %a lexical.
my ( %index, %index2, %index3, %index4 );

my $sdocno    = 'aaaa';    # internal document id, advanced with ++
my $database  = 'aa';      # base name of the current term database
my $database1 = join '.', $database, 'dbm';
my $database2 = './docno_filename.dbm';
my $database3 = './docno_reldocno.dbm';

# tie_btree(\%hash, $path)
#   Ties the hash to a DB_File B-tree at $path, creating it if necessary.
#   Dies when the tie fails.  The low-precedence "or" is essential here: with
#   "||" the die would attach to $DB_BTREE (always true) and never fire.
sub tie_btree {
    my ( $hash_ref, $path ) = @_;
    tie %{$hash_ref}, 'DB_File', $path, O_RDWR | O_CREAT, 0777, $DB_BTREE
        or die "DBM $path not open: $!\n";
    return;
}

# flush_document()
#   Appends the per-document term counts held in %index4 to the term database
#   as "|<docid>-<count>", empties %index4 and advances the document id.
sub flush_document {
    for my $term ( sort keys %index4 ) {
        my $postings = $index{$term};
        $postings = '' unless defined $postings;
        $index{$term} = "$postings|$sdocno-$index4{$term}";
    }
    %index4 = ();    # "= {}" would leave one stringified-hashref key behind
    $sdocno++;
    return;
}

tie_btree( \%index,  $database1 );
tie_btree( \%index2, $database2 );
tie_btree( \%index3, $database3 );

# ---------------------------------------------------------------------------
# Main loop: one iteration per input file.
# ---------------------------------------------------------------------------
my $doc_count = 1;    # counts input files, not documents
open my $time_fh, '>>', 'time2.log'
    or die "Cannot open time2.log: $!\n";

for my $file_name ( glob '../FBIS/*' ) {
    my $doc_fh;
    unless ( open $doc_fh, '<', $file_name ) {
        warn "Cannot open $file_name: $!\n";
        next;
    }

    my ( $bg_sec, $bg_usec ) = gettimeofday;

    # NOTE(review): the filehandle reads and the tag in the pattern below were
    # lost from the source this was recovered from (it read "while()" and
    # "if( // )").  "<DOCNO>" is reconstructed from the three-field split that
    # follows -- confirm against the corpus format.
    while ( my $line = <$doc_fh> ) {
        chomp $line;
        next unless $line =~ /<DOCNO>/;

        # The line is split on whitespace; the second field is kept as the
        # document number.
        my ( undef, $doc_num ) = split ' ', $line;
        $index2{$sdocno} = $file_name;
        $index3{$sdocno} = defined $doc_num ? $doc_num : '';

        # Consume the body of this document up to its closing tag.
        while ( my $text = <$doc_fh> ) {
            chomp $text;
            last if $text =~ /<\/DOC>/;
            next if $text =~ /<.*>/;    # skip any line carrying markup

            $text =~ s/[\.\,\)\(\"\'\:\;\-\&\[\]\{\}\!\?]//g;
            $text =~ s/\d//g;

            # Lowercase before the stopword test so that capitalised
            # stopwords ("The") are removed as well.
            my @words = map { clear_stopword( lc $_ ) } split ' ', $text;
            @words = @{ stem(@words) };

            for my $index_word (@words) {
                next if $index_word eq '';
                $index4{$index_word}++;
            }
        }

        # Flush on </DOC> and also when the file ended mid-document, so a
        # truncated document cannot leak its counts into the next one.
        flush_document();
    }

    my ( $end_sec, $end_usec ) = gettimeofday;
    my $sec  = $end_sec - $bg_sec;
    my $usec = $end_usec - $bg_usec;
    if ( $usec < 0 ) {
        $usec += 1_000_000;
        $sec--;
    }
    my $elapsed = sprintf '%d.%06d', $sec, $usec;

    print "$doc_count $file_name time used : $elapsed \n";
    print {$time_fh} "$doc_count $file_name time used : $elapsed \n";

    # Rotate the term database after every 30 input files.
    if ( $doc_count % 30 == 0 ) {
        untie %index;
        $database++;
        $database1 = join '.', $database, 'dbm';
        print "count : $doc_count - file : $database1\n";
        tie_btree( \%index, $database1 );
    }

    $doc_count++;
    close $doc_fh;
}

close $time_fh;

untie %index;
untie %index2;
untie %index3;