From e854442e1f402349419be2205ddc47173ab9a3df Mon Sep 17 00:00:00 2001 From: Andrew Dolgov Date: Tue, 4 Aug 2015 12:52:49 +0300 Subject: [PATCH] more fts stuff for simple index --- include/functions2.php | 83 +++++++++++++++++------------------ include/rssfuncs.php | 11 +++++ schema/ttrss_schema_pgsql.sql | 2 + schema/versions/mysql/128.sql | 5 +++ schema/versions/pgsql/128.sql | 8 ++++ update.php | 36 +++++++++++++++ 6 files changed, 103 insertions(+), 42 deletions(-) create mode 100644 schema/versions/mysql/128.sql create mode 100644 schema/versions/pgsql/128.sql diff --git a/include/functions2.php b/include/functions2.php index 206673b7d..a9bb49df8 100644 --- a/include/functions2.php +++ b/include/functions2.php @@ -282,17 +282,10 @@ function search_to_sql($search) { - /*if (DB_TYPE == "pgsql") { - $search_escaped = db_escape_string($search); - - return array("(to_tsvector('english', SUBSTR(ttrss_entries.title, 0, 200) || ' ' || SUBSTR(content, 0, 800)) - @@ to_tsquery('$search_escaped'))", explode(" ", $search)); - }*/ - $keywords = str_getcsv($search, " "); $query_keywords = array(); $search_words = array(); - $search_query_leftover = ""; + $search_query_leftover = array(); foreach ($keywords as $k) { if (strpos($k, "-") === 0) { @@ -390,23 +383,27 @@ array_push($query_keywords, "(".SUBSTRING_FOR_DATE."(updated,1,LENGTH('$k')) $not = '$k')"); } else { - $search_query_leftover .= $k . " "; - if (!$not) array_push($search_words, $k); + if (DB_TYPE == "pgsql") { + $k = mb_strtolower($k); + array_push($search_query_leftover, $not ? "!$k" : $k); + } else { + array_push($query_keywords, "(UPPER(ttrss_entries.title) $not LIKE UPPER('%$k%') + OR UPPER(ttrss_entries.content) $not LIKE UPPER('%$k%'))"); + } - /*array_push($query_keywords, "(UPPER(ttrss_entries.title) $not LIKE UPPER('%$k%') - OR UPPER(ttrss_entries.content) $not LIKE UPPER('%$k%'))"); - if (!$not) array_push($search_words, $k);*/ + if (!$not) array_push($search_words, $k); } } } - if ($search_query_leftover) { - $search_query_leftover = db_escape_string($search_query_leftover); + if (count($search_query_leftover) > 0) { + $search_query_leftover = db_escape_string(implode(" & ", $search_query_leftover)); - array_push($query_keywords, - "(to_tsvector('simple', SUBSTR(ttrss_entries.title, 0, 200) || ' ' || SUBSTR(content, 0, 800)) - @@ to_tsquery('$search_query_leftover'))"); + if (DB_TYPE == "pgsql") { + array_push($query_keywords, + "(tsvector_combined @@ '$search_query_leftover'::tsquery)"); + } } @@ -713,29 +710,30 @@ if ($feed == -3) $first_id_query_strategy_part = "true"; - // if previous topmost article id changed that means our current pagination is no longer valid - $query = "SELECT DISTINCT - ttrss_feeds.title, - date_entered, - guid, - ttrss_entries.id, - ttrss_entries.title, - updated, - score, - marked, - published, - last_marked, - last_published - FROM - $from_qpart - WHERE - $feed_check_qpart - ttrss_user_entries.ref_id = ttrss_entries.id AND - ttrss_user_entries.owner_uid = '$owner_uid' AND - $search_query_part - $start_ts_query_part - $since_id_part - $first_id_query_strategy_part ORDER BY $order_by LIMIT 1"; + if (!$search) { + // if previous topmost article id changed that means our current pagination is no longer valid + $query = "SELECT DISTINCT + ttrss_feeds.title, + date_entered, + guid, + ttrss_entries.id, + ttrss_entries.title, + updated, + score, + marked, + published, + last_marked, + last_published + FROM + $from_qpart + WHERE + $feed_check_qpart + ttrss_user_entries.ref_id = ttrss_entries.id AND + ttrss_user_entries.owner_uid = '$owner_uid' AND + $search_query_part + $start_ts_query_part + $since_id_part + $first_id_query_strategy_part ORDER BY $order_by LIMIT 1"; if ($_REQUEST["debug"]) { print $query; @@ -743,12 +741,13 @@ $result = db_query($query); if ($result && db_num_rows($result) > 0) { - $first_id = (int) db_fetch_result($result, 0, "id"); + $first_id = (int)db_fetch_result($result, 0, "id"); if ($offset > 0 && $first_id && $check_first_id && $first_id != $check_first_id) { return array(-1, $feed_title, $feed_site_url, $last_error, $last_updated, $search_words, $first_id); } } + } $query = "SELECT DISTINCT date_entered, diff --git a/include/rssfuncs.php b/include/rssfuncs.php index 6eb4e6d98..5ebddf9ef 100644 --- a/include/rssfuncs.php +++ b/include/rssfuncs.php @@ -984,11 +984,22 @@ _debug("RID: $entry_ref_id, IID: $entry_int_id", $debug_enabled); + if (DB_TYPE == "pgsql") { + $tsvector_combined = db_escape_string(mb_substr($entry_title . ' ' . strip_tags($entry_content), + 0, 1000000)); + + $tsvector_qpart = "tsvector_combined = to_tsvector('simple', '$tsvector_combined'),"; + + } else { + $tsvector_qpart = ""; + } + db_query("UPDATE ttrss_entries SET title = '$entry_title', content = '$entry_content', content_hash = '$entry_current_hash', updated = '$entry_timestamp_fmt', + $tsvector_qpart num_comments = '$num_comments', plugin_data = '$entry_plugin_data', author = '$entry_author', diff --git a/schema/ttrss_schema_pgsql.sql b/schema/ttrss_schema_pgsql.sql index 2b37c95fb..4cdc15f9a 100644 --- a/schema/ttrss_schema_pgsql.sql +++ b/schema/ttrss_schema_pgsql.sql @@ -142,12 +142,14 @@ create table ttrss_entries (id serial not null primary key, num_comments integer not null default 0, comments varchar(250) not null default '', plugin_data text, + tsvector_combined tsvector, lang varchar(2), author varchar(250) not null default ''); -- create index ttrss_entries_title_index on ttrss_entries(title); create index ttrss_entries_date_entered_index on ttrss_entries(date_entered); create index ttrss_entries_updated_idx on ttrss_entries(updated); +create index ttrss_entries_tsvector_combined_idx on ttrss_entries using gin(tsvector_combined); create table ttrss_user_entries ( int_id serial not null primary key, diff --git a/schema/versions/mysql/128.sql b/schema/versions/mysql/128.sql new file mode 100644 index 000000000..0545cb3ae --- /dev/null +++ b/schema/versions/mysql/128.sql @@ -0,0 +1,5 @@ +BEGIN; + +UPDATE ttrss_version SET schema_version = 128; + +COMMIT; diff --git a/schema/versions/pgsql/128.sql b/schema/versions/pgsql/128.sql new file mode 100644 index 000000000..d85ce7fe9 --- /dev/null +++ b/schema/versions/pgsql/128.sql @@ -0,0 +1,8 @@ +BEGIN; + +alter table ttrss_entries add column tsvector_combined tsvector; +create index ttrss_entries_tsvector_combined_idx on ttrss_entries using gin(tsvector_combined); + +UPDATE ttrss_version SET schema_version = 128; + +COMMIT; diff --git a/update.php b/update.php index 521b956ad..06578aaa4 100755 --- a/update.php +++ b/update.php @@ -33,6 +33,7 @@ "update-schema", "convert-filters", "force-update", + "update-search-idx", "list-plugins", "help"); @@ -80,6 +81,7 @@ print " --log FILE - log messages to FILE\n"; print " --indexes - recreate missing schema indexes\n"; print " --update-schema - update database schema\n"; + print " --update-search-idx - update PostgreSQL fulltext search index\n"; print " --convert-filters - convert type1 filters to type2\n"; print " --force-update - force update of all feeds\n"; print " --list-plugins - list all available plugins\n"; @@ -330,6 +332,40 @@ } + if (isset($options["update-search-idx"])) { + echo "Generating search index...\n"; + + $result = db_query("SELECT COUNT(id) AS count FROM ttrss_entries"); + $count = db_fetch_result($result, 0, "count"); + + print "Total entries: $count.\n"; + + $offset = 0; + $limit = 1000; + + while (true) { + $result = db_query("SELECT id, title, content FROM ttrss_entries WHERE tsvector_combined IS NULL ORDER BY id LIMIT $limit OFFSET $offset"); + + if (db_num_rows($result) != 0) { + echo "Offset $offset...\n"; + + while ($line = db_fetch_assoc($result)) { + $tsvector_combined = db_escape_string(mb_substr($line['title'] . ' ' . strip_tags($line['content']), + 0, 1000000)); + + db_query("UPDATE ttrss_entries SET tsvector_combined = to_tsvector('simple', '$tsvector_combined') WHERE id = " . $line["id"]); + } + + $offset += $limit; + } else { + echo "All done.\n"; + break; + } + + } + + } + if (isset($options["list-plugins"])) { $tmppluginhost = new PluginHost(); $tmppluginhost->load_all($tmppluginhost::KIND_ALL);