Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

read index from Redis #7

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@ Menjalankan
2. Aktifkan module `mbstring` pada PHP dengan command `sudo apt-get install phpX.Y-mbstring` (sesuaikan `X.Y` dengan versi PHP yang terinstal)
3. Langsung kunjungi di browser

Tidak ada setup khusus karena aplikasi ini tidak menggunakan database.
##### Optional
Anda bisa mengaktifkan index menggunakan Redis dengan langkah-langkah berikut:
1. Ganti value `use_redis` pada *web/search.php* menjadi **true**
2. Pastikan service Redis telah aktif pada host server
3. Jalankan *redis-indexer.php* dari repository https://github.com/lafzi/lafzi-indexer pada host server

Disarankan menggunakan sistem operasi Linux karena sistem cache mengandalkan command di Linux.

Lisensi
Expand Down
1 change: 1 addition & 0 deletions lib/predis
Submodule predis added at 98ec0c
113 changes: 77 additions & 36 deletions search/search_ff.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,14 @@
include_once '../lib/trigram.php';
include_once '../lib/array_utility.php';
include_once '../lib/doc_class.php';
include_once '../lib/predis/autoload.php';

// fungsi pencari
// param : $query_final yang siap cari (sudah melalui pengodean fonetik)
// $term_list_filename nama file term list
// $post_list_filename nama file posting list
// $vocal aktif atau tidak
// $score_order true jika ingin menghitung keterurutan kemunculan term
// return : array of found_doc object
function search($query_final, $term_list_filename, $post_list_filename, $score_order = true, $filtered = true, $filter_threshold = 0.8) {

// baca seluruh term list simpan dalam hashmap
$term_hashmap = array();
$term_list = fopen($term_list_filename, 'r');

while (($line = fgets($term_list, 32)) !== false) {
list($term, $offset) = explode('|', $line);
$term_hashmap[$term] = intval($offset);
}

fclose($term_list);

// akses posting list
$post_list_file = new SplFileObject($post_list_filename);
function search($query_final, $vocal, $score_order = true, $filtered = true, $filter_threshold = 0.8, $use_redis = false) {

// ekstrak trigram dari query
$query_trigrams = trigram_frekuensi_posisi($query_final);
Expand All @@ -44,29 +30,84 @@ function search($query_final, $term_list_filename, $post_list_filename, $score_o
foreach ($query_trigrams as $query_trigram => $qtfp) {
list($qt_freq, $qt_pos) = $qtfp;

if (isset($term_hashmap[$query_trigram])) {
// ambil posting list yang sesuai untuk trigram ini
$post_list_file->fseek($term_hashmap[$query_trigram]);
$matched_posting_lists = explode(';', trim($post_list_file->current()));

// untuk setiap posting list untuk trigram ini
foreach ($matched_posting_lists as $data) {
list ($doc_id, $term_freq, $term_pos) = explode(':', $data);
$term_pos = explode(',', $term_pos);
//$term_pos = reset(explode(',', $term_pos));

// hitung jumlah kemunculan dll
if (isset($matched_docs[$doc_id])) {
$matched_docs[$doc_id]->matched_trigrams_count += ($qt_freq < $term_freq) ? $qt_freq : $term_freq;
} else {
$matched_docs[$doc_id] = new found_doc();
$matched_docs[$doc_id]->matched_trigrams_count = 1;
$matched_docs[$doc_id]->id = $doc_id;
if ($use_redis) {

Predis\Autoloader::register();
$redis = new Predis\Client();

$key_prefix = $vocal ? "vocal-" : "nonvocal-";
$key = $key_prefix.$query_trigram;

// index dari redis
if ($redis->exists($key)){
//ambil posting list yang sesuai untuk trigram ini
$matched_posting_lists = json_decode($redis->get($key),true);

// untuk setiap posting list untuk trigram ini
foreach ($matched_posting_lists as $data) {
list ($doc_id, $term_freq, $term_pos) = $data;

// hitung jumlah kemunculan dll
if (isset($matched_docs[$doc_id])) {
$matched_docs[$doc_id]->matched_trigrams_count += ($qt_freq < $term_freq) ? $qt_freq : $term_freq;
} else {
$matched_docs[$doc_id] = new found_doc();
$matched_docs[$doc_id]->matched_trigrams_count = 1;
$matched_docs[$doc_id]->id = $doc_id;
}

$matched_docs[$doc_id]->matched_terms[$query_trigram] = $term_pos; // $term_pos is an array
}
}

$matched_docs[$doc_id]->matched_terms[$query_trigram] = $term_pos; // $term_pos is an array
} else {

if ($vocal) {
$term_list_filename = "../data/index_termlist_vokal.txt";
$post_list_filename = "../data/index_postlist_vokal.txt";
} else {
$term_list_filename = "../data/index_termlist_nonvokal.txt";
$post_list_filename = "../data/index_postlist_nonvokal.txt";
}

// baca seluruh term list simpan dalam hashmap
$term_hashmap = array();
$term_list = fopen($term_list_filename, 'r');

while (($line = fgets($term_list, 32)) !== false) {
list($term, $offset) = explode('|', $line);
$term_hashmap[$term] = intval($offset);
}

fclose($term_list);

// akses posting list
$post_list_file = new SplFileObject($post_list_filename);

// index dari file
if (isset($term_hashmap[$query_trigram])) {
// ambil posting list yang sesuai untuk trigram ini
$post_list_file->fseek($term_hashmap[$query_trigram]);
$matched_posting_lists = explode(';', trim($post_list_file->current()));

// untuk setiap posting list untuk trigram ini
foreach ($matched_posting_lists as $data) {
list ($doc_id, $term_freq, $term_pos) = explode(':', $data);
$term_pos = explode(',', $term_pos);
//$term_pos = reset(explode(',', $term_pos));

// hitung jumlah kemunculan dll
if (isset($matched_docs[$doc_id])) {
$matched_docs[$doc_id]->matched_trigrams_count += ($qt_freq < $term_freq) ? $qt_freq : $term_freq;
} else {
$matched_docs[$doc_id] = new found_doc();
$matched_docs[$doc_id]->matched_trigrams_count = 1;
$matched_docs[$doc_id]->id = $doc_id;
}

$matched_docs[$doc_id]->matched_terms[$query_trigram] = $term_pos; // $term_pos is an array
}
}
}
}

Expand Down
18 changes: 7 additions & 11 deletions web/search.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
// fwrite($lf, $ls);
// fclose($lf);

$configs = (object) array(
'use_redis' => false
);

if (isset($_GET['q']) && $_GET['q'] != "") {

//if (isset($_GET['order'])) {
Expand Down Expand Up @@ -37,14 +41,6 @@

$query_final = id_fonetik($query, !$vowel);
$query_trigrams_count = strlen($query_final) - 2;

if ($vowel) {
$term_list_filename = "../data/index_termlist_vokal.txt";
$post_list_filename = "../data/index_postlist_vokal.txt";
} else {
$term_list_filename = "../data/index_termlist_nonvokal.txt";
$post_list_filename = "../data/index_postlist_nonvokal.txt";
}

// baca data teks quran untuk ditampilkan

Expand Down Expand Up @@ -83,18 +79,18 @@

// first attempt with threshold 0.95
$th = 0.95; //0.8;
$matched_docs = search($query_final, $term_list_filename, $post_list_filename, $order, $filtered, $th);
$matched_docs = search($query_final, $vowel, $order, $filtered, $th, $configs->use_redis);

// if there are no results, lower the threshold to 0.8
if(count($matched_docs) == 0) {
$th = 0.8; //0.7;
$matched_docs = search($query_final, $term_list_filename, $post_list_filename, $order, $filtered, $th);
$matched_docs = search($query_final, $vowel, $order, $filtered, $th, $configs->use_redis);
}

// if there are still no results, lower the threshold to 0.7
if(count($matched_docs) == 0) {
$th = 0.7; //0.6;
$matched_docs = search($query_final, $term_list_filename, $post_list_filename, $order, $filtered, $th);
$matched_docs = search($query_final, $vowel, $order, $filtered, $th, $configs->use_redis);
}

// jika masih tanpa hasil, ya sudah
Expand Down