Page 1 sur 1

Script PHP de l’algorithme de SimHash

Message posté : Mer 12 Nov 2014 13:51
par Nicolaseo
Code: Tout sélectionner
<?php

error_reporting(E_ALL);

// ================ Fonctions ========================================

/**
 * Convert a hexadecimal string into a string of binary digits.
 *
 * Every hex digit is expanded to exactly four '0'/'1' characters, so the
 * result is four times as long as the input.
 *
 * @param string $str_hex Hexadecimal digits (for example an md5 digest).
 * @return string Binary digits; the empty string for empty input.
 */
function hexbin($str_hex) {
  // Start from an empty string rather than FALSE so that empty input
  // returns '' instead of a boolean.
  $str_bin = '';
  $length = strlen($str_hex);
  for ($i = 0; $i < $length; $i++) {
    // "%04s" left-pads the binary digits with zeros to a width of four.
    $str_bin .= sprintf("%04s", decbin(hexdec($str_hex[$i])));
  }
  return $str_bin;
}

/**
 * Build the weighted SimHash vector for a set of tokens.
 *
 * For each of the first HASHBITS positions of a token's hash, the token's
 * weight is added to that position's total when the bit equals 1 and
 * subtracted otherwise.
 *
 * @param array $tokens Entries carrying a "hash" (indexable sequence of
 *                      bits) and a "weight".
 * @return array HASHBITS signed totals, indexed from 0.
 */
function Charikar_SimHash($tokens) {
   $V = array_fill(0, HASHBITS, 0);
   foreach ($tokens as $token) {
      $weight = intval($token["weight"]);
      $bits = $token["hash"];
      for ($pos = 0; $pos < HASHBITS; $pos++) {
         $V[$pos] += ($bits[$pos] == 1) ? $weight : -$weight;
      }
   }
   return $V;
}

/**
 * Reduce a SimHash vector to its binary fingerprint.
 *
 * A position becomes 1 when its total is zero or positive, 0 otherwise.
 *
 * @param array $V Signed totals, indexed 0 .. HASHBITS-1.
 * @return array HASHBITS integers, each 0 or 1.
 */
function SimHashfingerprint($V) {
   $fingerprint = array();
   for ($pos = 0; $pos < HASHBITS; $pos++) {
      $fingerprint[$pos] = ($V[$pos] >= 0) ? 1 : 0;
   }
   return $fingerprint;
}

/**
 * Count the positions at which two fingerprints differ.
 *
 * Only the first HASHBITS positions are compared, using loose inequality.
 *
 * @param array $V1 First fingerprint.
 * @param array $V2 Second fingerprint.
 * @return int Number of differing positions (0 .. HASHBITS).
 */
function SimHashHamming($V1, $V2) {
   $mismatches = 0;
   $pos = 0;
   while ($pos < HASHBITS) {
      if ($V1[$pos] != $V2[$pos]) {
         $mismatches++;
      }
      $pos++;
   }
   return $mismatches;
}

// ====================================================================


// Number of leading hash bits used for the SimHash vector and fingerprint.
define("HASHBITS", 32);

/**
 * Split a text into word tokens and attach a weight and a hash to each.
 *
 * The weight is the number of occurrences of the word in the text; the hash
 * is the md5 digest of the word expanded to binary digits by hexbin().
 *
 * NOTE(review): the pattern matches case-insensitively, but occurrences are
 * counted case-sensitively, so "Mon" and "mon" are distinct tokens. Confirm
 * that this is intended before lower-casing the input.
 *
 * @param string $text Text to tokenize.
 * @return array Map of word => array("weight" => int, "hash" => string).
 */
function SimHashTokens($text) {
   // Always return an array, so a text without any word gives an empty
   // token set instead of an undefined variable.
   $tokens = array();
   preg_match_all('/\b[a-z0-9]+\b/i', $text, $words);
   foreach (array_count_values($words[0]) as $word => $weight) {
      $tokens[$word]["weight"] = $weight;
      $tokens[$word]["hash"] = hexbin(hash('md5', $word));
   }
   return $tokens;
}

$T1 = "Pres de la cheminee mon chat est endormi sur le tapis. Mon chat se nomme blacky.";
$T2 = "Mon chat blacky dort sur le tapis a cote de la cheminee.";
$T3 = "J’ai un chat de couleur noir qui se nomme blacky. Mon chat aime dormir sur le tapis pres de la cheminee.";

echo "Check Duplicate Content by Charikar SimHash" . "\n";
echo "===========================================" . "\n\n";

// Compute and print the fingerprint of each sample text.
$texts = array("T1" => $T1, "T2" => $T2, "T3" => $T3);
$fingerprints = array();
foreach ($texts as $label => $text) {
   $fingerprints[$label] = SimHashfingerprint(Charikar_SimHash(SimHashTokens($text)));
   echo $label . ": " . $text . "\n";
   echo $label . " SimHash fingerprint: ";
   foreach ($fingerprints[$label] as $bit) {
      echo $bit . " ";
   }
   echo "\n\n";
}

// Pairwise Hamming distances between the three fingerprints.
echo "Distance de Hamming (T1,T2): " . SimHashHamming($fingerprints["T1"], $fingerprints["T2"]) . "\n";
echo "Distance de Hamming (T1,T3): " . SimHashHamming($fingerprints["T1"], $fingerprints["T3"]) . "\n";
echo "Distance de Hamming (T2,T3): " . SimHashHamming($fingerprints["T2"], $fingerprints["T3"]) . "\n";

?>

Re: Script PHP de l’algorithme de SimHash

Message posté : Mer 12 Nov 2014 13:54
par Nicolaseo
Un autre code à tester :

Code: Tout sélectionner
runWithWords($words[0]);
}

/**
 * Compute the result of runWithTokens() for a list of words.
 *
 * Each distinct word becomes one token whose weight is its number of
 * occurrences and whose hash is its md5 digest passed through hexbin().
 *
 * @param array $words Words, repetitions included.
 * @return mixed Whatever runWithTokens() returns for the built token set.
 */
public function runWithWords(array $words)
{
    $tokens = array();

    foreach (array_count_values($words) as $word => $occurrences) {
        $tokens[$word] = array(
            'weight' => $occurrences,
            'hash'   => $this->hexbin(md5($word)),
        );
    }

    return $this->runWithTokens($tokens);
}

/**
 * Compute the fingerprint of an already prepared token set.
 *
 * @param array $tokens Token set handed to vectorize().
 * @return mixed Result of fingerprint() applied to the vectorized tokens.
 */
public function runWithTokens(array $tokens)
{
    $vector = $this->vectorize($tokens);

    return $this->fingerprint($vector);
}

/**
 * Build the weighted vector for a token set.
 *
 * For each of the first self::HASHBITS positions of a token's hash, the
 * token's weight is added when the bit equals 1 and subtracted otherwise.
 *
 * @param array $tokens Entries carrying a 'hash' (indexable sequence of
 *                      bits) and a 'weight'.
 * @return array self::HASHBITS signed totals, indexed from 0.
 */
protected function vectorize($tokens)
{
    $vector = array_fill(0, self::HASHBITS, 0);

    foreach ($tokens as $token) {
        $weight = intval($token['weight']);

        for ($i = 0; $i < self::HASHBITS; $i++) {
            if ($token['hash'][$i] == 1) {
                $vector[$i] += $weight;
            } else {
                // Plain ASCII minus: the typographic dash previously used
                // on this subtraction is not a PHP operator.
                $vector[$i] -= $weight;
            }
        }
    }

    return $vector;
}

/**
 * Reduce a weighted vector to an integer fingerprint.
 *
 * A string of self::HASHBITS '0' characters is built, each position whose
 * total is zero or positive is set to '1', and the resulting binary string
 * is converted with bindec().
 *
 * NOTE(review): the loop header and the condition were reconstructed; the
 * published text had lost everything between "$i" and "= 0 )". Confirm
 * against the author's original source.
 *
 * @param array $vector Signed totals, indexed 0 .. self::HASHBITS-1.
 * @return int|float Value returned by bindec() for the bit string.
 */
protected function fingerprint($vector)
{
    $fingerprint = str_pad('', self::HASHBITS, '0');

    for ($i = 0; $i < self::HASHBITS; $i++) {
        if ($vector[$i] >= 0) {
            // Straight quotes: the typographic ones did not delimit a string.
            $fingerprint[$i] = '1';
        }
    }

    return bindec($fingerprint);
}

/**
 * Convert a hexadecimal string into a string of binary digits, four per
 * hex digit.
 *
 * NOTE(review): the published text of this method was cut off inside the
 * loop condition. The loop below is restored from the standalone hexbin()
 * function earlier in the same thread; confirm against the author's source.
 *
 * @param string $str_hex Hexadecimal digits (for example an md5 digest).
 * @return string Binary digits; the empty string for empty input.
 */
protected function hexbin($str_hex)
{
    $str_bin = '';

    for ($i = 0; $i < strlen($str_hex); $i++) {
        // "%04s" left-pads the binary digits with zeros to a width of four.
        $str_bin .= sprintf('%04s', decbin(hexdec($str_hex[$i])));
    }

    return $str_bin;
}

// NOTE(review): text is missing at this point in the published post. The
// lost part presumably held the rest of the class (a compare() method is
// called further down), its closing brace, and the creation of $simHash.
// None of that can be recovered from what is visible here.

// First fingerprint; the "$fp1 = $simHash->" prefix is inferred from the
// parallel $fp2 statement and from the later compare($fp1, $fp2) call.
$fp1 = $simHash->run('3937 9947 17210 6387 7314 2242 20394 11864 18364 5422 1469 15953 670 1529 14236 1556 14958 3472 16509 11353 4334 17874 18714 19051 16699 15168 14866 6767 5607 5607 3072 4669 16117 12790 17331 8775 8775 8775 2662 11887 1868 984 8936 15286 14558 10620 11613 1333 7769 17427 17788');

// Second fingerprint. The argument is delimited with straight quotes; the
// typographic quotes in the published text did not form a string literal.
$fp2 = $simHash->run('3937 9947 17210 6387 7314 2242 20394 11864 18364 5422 1469 15953 670 1529 14236 1556 14958 3472 16509 11353 4334 17874 18714 19051 16699 15168 14866 6767 5607 5607 3072 4669 16117 17331 17331 8775 8775 2662 11887 1868 984 8936 15286 14558 10620 11613 1333 7769 17427 17788');

// Reference time for the elapsed-time measurement printed at the end.
$start = microtime(true);

// NOTE(review): presumably filled by the loop that follows; that loop's
// text is truncated in the published post, so its use cannot be confirmed.
$weights = array();

for($i = 0; $i compare($fp1, $fp2);
}

// Print the elapsed time in seconds. The subtraction uses a plain ASCII
// minus; the typographic dash in the published text was a parse error.
echo microtime(true) - $start;