search.php File Reference

Go to the source code of this file.

Functions

 search_results ()
 matches_text ($num)
 report_matches ()
 end_form ($value)
 readInt ($file)
 readString ($file)
 readHeader ($file)
 computeIndex ($word)
 search ($file, $word, &$statsList)
 combine_results ($results, &$docs)
 filter_results ($docs, &$requiredWords, &$forbiddenWords)
 compare_rank ($a, $b)
 sort_results ($docs, &$sorted)
 report_results (&$docs)
 main ()


Function Documentation

combine_results ( $results, &$docs )

Definition at line 197 of file search.php.

Referenced by main().

/**
 * Folds the per-word search results into a table of documents keyed by URL.
 *
 * Every entry of $results must provide "word", "match" and a "docs" list
 * whose items carry "url", "name", "rank" and "freq". A document that is hit
 * by several words accumulates the rank of each hit and records every
 * matching word in its "words" list.
 *
 * @param array $results per-word results
 * @param array $docs    in/out: documents found so far, indexed by URL
 * @return array the updated $docs table
 */
function combine_results($results, &$docs)
{
  foreach ($results as $wordInfo)
  {
    foreach ($wordInfo["docs"] as $di)
    {
      $key  = $di["url"];
      $rank = $di["rank"];
      // Direct key lookup: avoids rebuilding the key list for every hit and
      // the loose comparison performed by in_array().
      if (array_key_exists($key, $docs))
      {
        $docs[$key]["rank"] += $rank;
      }
      else
      {
        $docs[$key] = array("url"=>$key,
            "name"=>$di["name"],
            "rank"=>$rank
            );
      }
      // remember which query word matched this document and how often
      $docs[$key]["words"][] = array(
               "word"=>$wordInfo["word"],
               "match"=>$wordInfo["match"],
               "freq"=>$di["freq"]
               );
    }
  }
  return $docs;
}

compare_rank ( $a, $b )

Definition at line 267 of file search.php.

/**
 * Ordering callback that places the document with the higher "rank" first.
 *
 * @param array $a document entry with a "rank" element
 * @param array $b document entry with a "rank" element
 * @return int -1 when $a outranks $b, 0 when the ranks are equal, 1 otherwise
 */
function compare_rank($a, $b)
{
  $left  = $a["rank"];
  $right = $b["rank"];
  if ($left == $right)
  {
    return 0;
  }
  if ($left > $right)
  {
    return -1;
  }
  return 1;
}

computeIndex ( $word )

Definition at line 75 of file search.php.

Referenced by search().

/**
 * Computes the hash key of a word from its first two bytes.
 *
 * Only the first two bytes take part in the hash, so every word sharing
 * that two-byte prefix gets the same key; this is what allows a substring
 * (prefix) search.
 *
 * @param string $word word to hash
 * @return int $hi*256+$lo for the first two bytes, or -1 when the word is
 *             shorter than two bytes or either byte is NUL
 */
function computeIndex($word)
{
  // Simple hashing that allows for substring search
  if (strlen($word)<2) return -1;
  // high char of the index ([] offsets: the {} form no longer parses on PHP 8)
  $hi = ord($word[0]);
  if ($hi==0) return -1;
  // low char of the index
  $lo = ord($word[1]);
  if ($lo==0) return -1;
  // return index
  return $hi*256+$lo;
}

end_form ( $value )

Definition at line 49 of file search.php.

Referenced by main().

/**
 * Prints the query input field and the markup that closes the search form.
 *
 * @param string $value text to pre-fill the query field with; it is taken
 *                      from the request, so it is HTML-escaped before being
 *                      placed inside the value="..." attribute
 * @return void
 */
function end_form($value)
{
  // Escape quotes as well: the value is emitted inside a double-quoted attribute.
  $safe = htmlspecialchars($value, ENT_QUOTES);
  echo "            <td><input type=\"text\" name=\"query\" value=\"$safe\" size=\"20\" accesskey=\"s\"/></td>\n          </tr>\n        </table>\n      </form>\n    </li>\n  </ul>\n</div>\n";
}

filter_results ( $docs, &$requiredWords, &$forbiddenWords )

Definition at line 227 of file search.php.

Referenced by main().

/**
 * Drops the documents that miss a required word or contain a forbidden one.
 *
 * A document is kept when its "words" list contains every entry of
 * $requiredWords and none of the entries of $forbiddenWords. Words are
 * compared case-insensitively, so a query term written with capitals still
 * matches the word recorded for the document.
 *
 * @param array $docs           documents indexed by URL, each with a "words" list
 * @param array $requiredWords  words every kept document must contain
 * @param array $forbiddenWords words no kept document may contain
 * @return array the surviving documents, keys preserved
 */
function filter_results($docs, &$requiredWords, &$forbiddenWords)
{
  $filteredDocs=array();
  foreach ($docs as $key => $doc)
  {
    $words = $doc["words"];
    $copy=1; // copy entry by default
    foreach ($requiredWords as $reqWord)
    {
      $found=0;
      foreach ($words as $wordInfo)
      {
        if (strcasecmp($wordInfo["word"],$reqWord)==0)
        {
          $found=1;
          break;
        }
      }
      if (!$found)
      {
        $copy=0; // document lacks at least one required word
        break;
      }
    }
    if ($copy)
    {
      foreach ($words as $wordInfo)
      {
        foreach ($forbiddenWords as $badWord)
        {
          if (strcasecmp($wordInfo["word"],$badWord)==0)
          {
            $copy=0; // document contains a forbidden word
            break 2;
          }
        }
      }
    }
    if ($copy) $filteredDocs[$key]=$doc;
  }
  return $filteredDocs;
}

main ( void   ) 

Definition at line 325 of file search.php.

References combine_results(), end_form(), filter_results(), main(), readHeader(), report_results(), search(), and sort_results().

/**
 * Entry point of the search page.
 *
 * Opens search.idx, checks its "DOXS" header, prints the rest of the search
 * form, then splits the "query" request parameter on spaces. A word prefixed
 * with '+' is required, a word prefixed with '-' is forbidden. Each distinct
 * word is looked up in the index; the hits are combined per document,
 * filtered, sorted by rank and printed.
 *
 * Terminates the script with an error message when the PHP version is too
 * old, the index cannot be opened, or its header is invalid.
 *
 * @return void
 */
function main()
{
  // version_compare() handles multi-digit components, which a plain string
  // comparison does not ("10.0.0" sorts before "4.1.0" as a string).
  if (!function_exists("version_compare") || version_compare(phpversion(), "4.1.0") < 0)
  {
    die("Error: PHP version 4.1.0 or above required!");
  }
  if (!($file=fopen("search.idx","rb")))
  {
    die("Error: Search index file could NOT be opened!");
  }
  if (readHeader($file)!="DOXS")
  {
    die("Error: Header of index file is invalid!");
  }
  $query="";
  // ignore array-valued parameters such as query[]=x
  if (array_key_exists("query", $_GET) && is_string($_GET["query"]))
  {
    $query=$_GET["query"];
  }
  end_form($query);
  echo "&nbsp;\n<div class=\"searchresults\">\n";
  $results = array();
  $requiredWords = array();
  $forbiddenWords = array();
  $foundWords = array();
  $word=strtok($query," ");
  while ($word!==false) // for each word in the search query (a literal "0" included)
  {
    // the lookup is done in lower case, so record required/forbidden
    // words in the same form
    $word=strtolower($word);
    $prefix=substr($word,0,1);
    if ($prefix=='+' || $prefix=='-')
    {
      $word=(string)substr($word,1);
      if ($word!=="") // a bare "+" or "-" carries no word
      {
        if ($prefix=='+') { $requiredWords[]=$word; }
        else              { $forbiddenWords[]=$word; }
      }
    }
    if ($word!=="" && !in_array($word,$foundWords))
    {
      $foundWords[]=$word;
      search($file,$word,$results);
    }
    $word=strtok(" ");
  }
  $docs = array();
  combine_results($results,$docs);
  // filter out documents with forbidden word or that do not contain
  // required words
  $filteredDocs = filter_results($docs,$requiredWords,$forbiddenWords);
  // sort the results based on rank
  $sorted = array();
  sort_results($filteredDocs,$sorted);
  // report results to the user
  report_results($sorted);
  echo "</div>\n";
  fclose($file);
}

matches_text ( $num )

Definition at line 29 of file search.php.

/**
 * Builds the sentence that tells the user how many documents matched.
 *
 * @param int $num number of matching documents
 * @return string message for zero, exactly one, or several matches
 */
function matches_text($num)
{
  if ($num==0)
  {
    return "Sorry, no documents matching your query.";
  }
  if ($num==1)
  {
    return "Found <b>1</b> document matching your query.";
  }
  // more than one match
  return "Found <b>".$num."</b> documents matching your query. Showing best matches first.";
}

readHeader ( $file )

Definition at line 68 of file search.php.

Referenced by main().

/**
 * Reads the next four bytes of the file and returns them as a string.
 *
 * @param resource $file open file handle
 * @return string the bytes read; shorter than four when the file ends early
 */
function readHeader($file)
{
  $header = "";
  for ($i=0; $i<4; $i++)
  {
    // at end of file fgetc() yields false, which appends nothing
    $header .= fgetc($file);
  }
  return $header;
}

readInt ( $file )

Definition at line 54 of file search.php.

Referenced by search().

/**
 * Reads four bytes from the file and combines them into an integer, most
 * significant byte first.
 *
 * @param resource $file open file handle
 * @return int the value read
 */
function readInt($file)
{
  $value = 0;
  for ($i=0; $i<4; $i++)
  {
    // shift the bytes read so far up and append the next one
    $value = ($value<<8) | ord(fgetc($file));
  }
  return $value;
}

readString ( $file )

Definition at line 61 of file search.php.

Referenced by search().

/**
 * Reads bytes from the file up to, and consuming, the next NUL byte.
 *
 * @param resource $file open file handle
 * @return string the bytes before the NUL; also ends at end of file
 */
function readString($file)
{
  $result = "";
  for (;;)
  {
    $c = fgetc($file);
    // stop at the terminator or when nothing more can be read
    if ($c === false || $c === "\0") break;
    $result .= $c;
  }
  return $result;
}

report_matches (  ) 

Definition at line 45 of file search.php.

/**
 * Returns the label printed in front of the list of matching words.
 *
 * @return string the label text
 */
function report_matches()
{
  $label = "Matches: ";
  return $label;
}

report_results ( &$docs )

Definition at line 283 of file search.php.

Referenced by main().

/**
 * Prints the result table: a heading, the number of matches and, for every
 * document, a numbered link followed by a row listing the matching words
 * with their frequencies.
 *
 * @param array $docs documents to show, each with "url", "name" and a
 *                    "words" list whose items carry "word", "match" and "freq"
 * @return void
 */
function report_results(&$docs)
{
  echo "<table cellspacing=\"2\">\n";
  echo "  <tr>\n";
  echo "    <td colspan=\"2\"><h2>".search_results()."</h2></td>\n";
  echo "  </tr>\n";
  $numDocs = sizeof($docs);
  if ($numDocs==0)
  {
    echo "  <tr>\n";
    echo "    <td colspan=\"2\">".matches_text(0)."</td>\n";
    echo "  </tr>\n";
  }
  else
  {
    echo "  <tr>\n";
    echo "    <td colspan=\"2\">".matches_text($numDocs);
    echo "\n";
    echo "    </td>\n";
    echo "  </tr>\n";
    $num=1;
    foreach ($docs as $doc)
    {
      // NOTE(review): "url" and "name" are printed as stored; presumably the
      // index already holds them in a form that is safe for HTML - confirm.
      echo "  <tr>\n";
      echo "    <td align=\"right\">$num.</td>";
      echo     "<td><a class=\"el\" href=\"".$doc["url"]."\">".$doc["name"]."</a></td>\n";
      echo "  </tr>\n"; // close the link row before opening the matches row
      echo "  <tr>\n";
      echo "    <td></td><td class=\"tiny\">".report_matches()." ";
      foreach ($doc["words"] as $wordInfo)
      {
        $word = $wordInfo["word"];
        // part of the matched word that follows the search term
        $matchRight = substr($wordInfo["match"],strlen($word));
        // the word originates from the user's query, so escape it (and the
        // rest of the matched word) before placing it in the page
        echo "<b>".htmlspecialchars($word, ENT_QUOTES)."</b>"
            .htmlspecialchars($matchRight, ENT_QUOTES)
            ."(".$wordInfo["freq"].") ";
      }
      echo "    </td>\n";
      echo "  </tr>\n";
      $num++;
    }
  }
  echo "</table>\n";
}

search ( $file, $word, &$statsList )

Definition at line 99 of file search.php.

References computeIndex(), readInt(), and readString().

Referenced by main().

/**
 * Looks up a word in the index file and appends the matches to $statsList.
 *
 * Steps:
 *  1. hash the word and read the entry for that hash key (4 bytes per
 *     entry, located after the 4-byte header);
 *  2. walk the word list found there and keep every word that starts with
 *     $word, flagging exact matches as "full";
 *  3. for every kept word read its documents (index offset and frequency;
 *     the lowest bit of the stored frequency marks a high priority
 *     document) together with each document's name and URL;
 *  4. compute a rank per document, where whole word matches count double.
 *
 * @param resource $file      open index file
 * @param string   $word      search term (the caller passes it lower-cased)
 * @param array    $statsList in/out: list of matches; each appended entry has
 *                            "word", "match", "index", "full" and "docs"
 * @return array the updated $statsList
 */
function search($file, $word, &$statsList)
{
  $index = computeIndex($word);
  if ($index==-1) return $statsList; // no valid index for this word

  fseek($file,$index*4+4); // 4 bytes per entry, skip header
  $index = readInt($file);
  if (!$index) return $statsList; // no words matching the hash key

  $start=sizeof($statsList);
  $count=$start;
  $wordLen=strlen($word);
  fseek($file,$index);
  $w = readString($file);
  while ($w)
  {
    $statIdx = readInt($file);
    // byte-wise prefix test; a loose == would treat numeric-looking strings
    // such as "100" and "1e2" as equal
    if (strncmp($w,$word,$wordLen)==0)
    { // found word that matches (as substring)
      $statsList[$count++]=array(
          "word"=>$word,
          "match"=>$w,
          "index"=>$statIdx,
          "full"=>strlen($w)==$wordLen,
          "docs"=>array()
          );
    }
    $w = readString($file);
  }

  $end=sizeof($statsList);
  $totalHi=0;
  $totalFreqHi=0;
  $totalFreqLo=0;
  for ($count=$start;$count<$end;$count++)
  {
    // whole word matches have a double weight
    $multiplier = $statsList[$count]["full"] ? 2 : 1;
    fseek($file,$statsList[$count]["index"]);
    $numDocs = readInt($file);
    $docInfo = array();
    // read docs info + occurrence frequency of the word
    for ($i=0;$i<$numDocs;$i++)
    {
      $idx=readInt($file);
      $freq=readInt($file);
      $docInfo[$i]=array("idx"  => $idx,
                         "freq" => $freq>>1,
                         "rank" => 0.0,
                         "hi"   => $freq&1
                        );
      // NOTE(review): the totals add the stored value including its flag
      // bit, while the ranks below use the shifted frequency - kept as is.
      if ($freq&1) // word occurs in high priority doc
      {
        $totalHi++;
        $totalFreqHi+=$freq*$multiplier;
      }
      else // word occurs in low priority doc
      {
        $totalFreqLo+=$freq*$multiplier;
      }
    }
    // read name and url info for the doc
    for ($i=0;$i<$numDocs;$i++)
    {
      fseek($file,$docInfo[$i]["idx"]);
      $docInfo[$i]["name"]=readString($file);
      $docInfo[$i]["url"]=readString($file);
    }
    $statsList[$count]["docs"]=$docInfo;
  }

  $totalFreq=($totalHi+1)*$totalFreqLo + $totalFreqHi;
  // Nothing to normalise against: leave every rank at its initial 0.0
  // instead of dividing by zero.
  if ($totalFreq==0) return $statsList;

  for ($count=$start;$count<$end;$count++)
  {
    // whole word matches have a double weight
    $multiplier = $statsList[$count]["full"] ? 2 : 1;
    $numDocs = sizeof($statsList[$count]["docs"]);
    for ($i=0;$i<$numDocs;$i++)
    {
      // compute frequency rank of the word in each doc
      $score = $statsList[$count]["docs"][$i]["freq"]*$multiplier;
      if ($statsList[$count]["docs"][$i]["hi"])
      {
        // high priority docs get the whole low priority total added
        $score += $totalFreqLo;
      }
      $statsList[$count]["docs"][$i]["rank"]=(float)($score)/$totalFreq;
    }
  }
  return $statsList;
}

search_results (  ) 

Definition at line 24 of file search.php.

/**
 * Returns the heading shown above the result list.
 *
 * @return string the heading text
 */
function search_results()
{
  $title = "Search Results";
  return $title;
}

sort_results ( $docs, &$sorted )

Definition at line 276 of file search.php.

Referenced by main().

/**
 * Copies the documents into $sorted and orders the copy with compare_rank().
 *
 * usort() re-indexes the array, so the keys of $docs are not kept in $sorted.
 *
 * @param array $docs   documents to sort
 * @param array $sorted out: the sorted copy
 * @return array the sorted copy
 */
function sort_results($docs, &$sorted)
{
  $sorted = $docs;
  usort($sorted, "compare_rank");
  return $sorted;
}


Generated on Sat Dec 15 00:01:39 2007 for BulmaGes by  doxygen 1.5.1