summaryrefslogtreecommitdiff
path: root/backend/djvu/djvu-text-page.c
diff options
context:
space:
mode:
Diffstat (limited to 'backend/djvu/djvu-text-page.c')
-rw-r--r--backend/djvu/djvu-text-page.c445
1 files changed, 445 insertions, 0 deletions
diff --git a/backend/djvu/djvu-text-page.c b/backend/djvu/djvu-text-page.c
new file mode 100644
index 00000000..3f171d1e
--- /dev/null
+++ b/backend/djvu/djvu-text-page.c
@@ -0,0 +1,445 @@
+/*
+ * Implements search and copy functionality for Djvu files.
+ * Copyright (C) 2006 Michael Hofmann <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <config.h>
+#include <string.h>
+#include <glib.h>
+#include <libdjvu/miniexp.h>
+#include "djvu-text-page.h"
+
+
+/**
+ * djvu_text_page_selection_process:
+ * @page: #DjvuTextPage instance
+ * @p: s-expression to append
+ * @delimit: character/word/... delimiter
+ *
+ * Appends the string in @p to the page text.
+ *
+ * Returns: whether the end was not reached in this s-expression
+ */
+static gboolean
+djvu_text_page_selection_process (DjvuTextPage *page,
+ miniexp_t p,
+ int delimit)
+{
+ if (page->text || p == page->start) {
+ char *token_text = (char *) miniexp_to_str (miniexp_nth (5, p));
+ if (page->text) {
+ char *new_text =
+ g_strjoin (delimit & 2 ? "\n" :
+ delimit & 1 ? " " : NULL,
+ page->text, token_text,
+ NULL);
+ g_free (page->text);
+ page->text = new_text;
+ } else
+ page->text = g_strdup (token_text);
+ if (p == page->end)
+ return FALSE;
+ }
+ return TRUE;
+}
+
+/**
+ * djvu_text_page_selection:
+ * @page: #DjvuTextPage instance
+ * @p: tree to append
+ * @delimit: character/word/... delimiter
+ *
+ * Walks the tree in @p and appends the text with
+ * djvu_text_page_selection_process() for all s-expressions
+ * between the start and end fields.
+ *
+ * Returns: whether the end was not reached in this subtree
+ */
+static gboolean
+djvu_text_page_selection (DjvuTextPage *page,
+ miniexp_t p,
+ int delimit)
+{
+ g_return_val_if_fail (miniexp_consp (p) && miniexp_symbolp
+ (miniexp_car (p)), FALSE);
+
+ if (miniexp_car (p) != page->char_symbol)
+ delimit |= miniexp_car (p) == page->word_symbol ? 1 : 2;
+
+ miniexp_t deeper = miniexp_cddr (miniexp_cdddr (p));
+ while (deeper != miniexp_nil) {
+ miniexp_t str = miniexp_car (deeper);
+ if (miniexp_stringp (str)) {
+ if (!djvu_text_page_selection_process
+ (page, p, delimit))
+ return FALSE;
+ } else {
+ if (!djvu_text_page_selection
+ (page, str, delimit))
+ return FALSE;
+ }
+ delimit = 0;
+ deeper = miniexp_cdr (deeper);
+ }
+ return TRUE;
+}
+
+static void
+djvu_text_page_limits_process (DjvuTextPage *page,
+ miniexp_t p,
+ EvRectangle *rect)
+{
+ EvRectangle current;
+
+ current.x1 = miniexp_to_int (miniexp_nth (1, p));
+ current.y1 = miniexp_to_int (miniexp_nth (2, p));
+ current.x2 = miniexp_to_int (miniexp_nth (3, p));
+ current.y2 = miniexp_to_int (miniexp_nth (4, p));
+ if (current.x2 >= rect->x1 && current.y1 <= rect->y2 &&
+ current.x1 <= rect->x2 && current.y2 >= rect->y1) {
+ if (page->start == miniexp_nil)
+ page->start = p;
+ page->end = p;
+ }
+}
+
+
+static void
+djvu_text_page_limits (DjvuTextPage *page,
+ miniexp_t p,
+ EvRectangle *rect)
+{
+ g_return_if_fail (miniexp_consp (p) &&
+ miniexp_symbolp (miniexp_car (p)));
+
+ miniexp_t deeper = miniexp_cddr (miniexp_cdddr (p));
+ while (deeper != miniexp_nil) {
+ miniexp_t str = miniexp_car (deeper);
+ if (miniexp_stringp (str))
+ djvu_text_page_limits_process (page, p, rect);
+ else
+ djvu_text_page_limits (page, str, rect);
+
+ deeper = miniexp_cdr (deeper);
+ }
+}
+
+char *
+djvu_text_page_copy (DjvuTextPage *page,
+ EvRectangle *rectangle)
+{
+ char* text;
+
+ page->start = miniexp_nil;
+ page->end = miniexp_nil;
+ djvu_text_page_limits (page, page->text_structure, rectangle);
+ djvu_text_page_selection (page, page->text_structure, 0);
+
+ /* Do not free the string */
+ text = page->text;
+ page->text = NULL;
+
+ return text;
+}
+
+/**
+ * djvu_text_page_position:
+ * @page: #DjvuTextPage instance
+ * @position: index in the page text
+ *
+ * Returns the closest s-expression that contains the given position in
+ * the page text.
+ *
+ * Returns: closest s-expression
+ */
+static miniexp_t
+djvu_text_page_position (DjvuTextPage *page,
+ int position)
+{
+ GArray *links = page->links;
+ int low = 0;
+ int hi = links->len - 1;
+ int mid = 0;
+
+ g_return_val_if_fail (hi >= 0, miniexp_nil);
+
+ /* Shamelessly copied from GNU classpath */
+ while (low <= hi) {
+ mid = (low + hi) >> 1;
+ DjvuTextLink *link =
+ &g_array_index (links, DjvuTextLink, mid);
+ if (link->position == position)
+ break;
+ else if (link->position > position)
+ hi = --mid;
+ else
+ low = mid + 1;
+ }
+
+ return g_array_index (page->links, DjvuTextLink, mid).pair;
+}
+
+/**
+ * djvu_text_page_union:
+ * @target: first rectangle and result
+ * @source: second rectangle
+ *
+ * Calculates the bounding box of two rectangles and stores the reuslt
+ * in the first.
+ */
+static void
+djvu_text_page_union (EvRectangle *target,
+ EvRectangle *source)
+{
+ if (source->x1 < target->x1)
+ target->x1 = source->x1;
+ if (source->x2 > target->x2)
+ target->x2 = source->x2;
+ if (source->y1 < target->y1)
+ target->y1 = source->y1;
+ if (source->y2 > target->y2)
+ target->y2 = source->y2;
+}
+
+/**
+ * djvu_text_page_sexpr_process:
+ * @page: #DjvuTextPage instance
+ * @p: s-expression to append
+ * @start: first s-expression in the selection
+ * @end: last s-expression in the selection
+ *
+ * Appends the rectangle defined by @p to the internal bounding box rectangle.
+ *
+ * Returns: whether the end was not reached in this s-expression
+ */
+static gboolean
+djvu_text_page_sexpr_process (DjvuTextPage *page,
+ miniexp_t p,
+ miniexp_t start,
+ miniexp_t end)
+{
+ if (page->bounding_box || p == start) {
+ EvRectangle *new_rectangle = ev_rectangle_new ();
+ new_rectangle->x1 = miniexp_to_int (miniexp_nth (1, p));
+ new_rectangle->y1 = miniexp_to_int (miniexp_nth (2, p));
+ new_rectangle->x2 = miniexp_to_int (miniexp_nth (3, p));
+ new_rectangle->y2 = miniexp_to_int (miniexp_nth (4, p));
+ if (page->bounding_box) {
+ djvu_text_page_union (page->bounding_box,
+ new_rectangle);
+ g_free (new_rectangle);
+ } else
+ page->bounding_box = new_rectangle;
+ if (p == end)
+ return FALSE;
+ }
+ return TRUE;
+}
+
+/**
+ * djvu_text_page_sexpr:
+ * @page: #DjvuTextPage instance
+ * @p: tree to append
+ * @start: first s-expression in the selection
+ * @end: last s-expression in the selection
+ *
+ * Walks the tree in @p and extends the rectangle with
+ * djvu_text_page_process() for all s-expressions between @start and @end.
+ *
+ * Returns: whether the end was not reached in this subtree
+ */
+static gboolean
+djvu_text_page_sexpr (DjvuTextPage *page,
+ miniexp_t p,
+ miniexp_t start,
+ miniexp_t end)
+{
+ g_return_val_if_fail (miniexp_consp (p) && miniexp_symbolp
+ (miniexp_car (p)), FALSE);
+
+ miniexp_t deeper = miniexp_cddr (miniexp_cdddr (p));
+ while (deeper != miniexp_nil) {
+ miniexp_t str = miniexp_car (deeper);
+ if (miniexp_stringp (str)) {
+ if (!djvu_text_page_sexpr_process
+ (page, p, start, end))
+ return FALSE;
+ } else {
+ if (!djvu_text_page_sexpr
+ (page, str, start, end))
+ return FALSE;
+ }
+ deeper = miniexp_cdr (deeper);
+ }
+ return TRUE;
+}
+
+/**
+ * djvu_text_page_box:
+ * @page: #DjvuTextPage instance
+ * @start: first s-expression in the selection
+ * @end: last s-expression in the selection
+ *
+ * Builds a rectangle that contains all s-expressions in the given range.
+ */
+static EvRectangle *
+djvu_text_page_box (DjvuTextPage *page,
+ miniexp_t start,
+ miniexp_t end)
+{
+ page->bounding_box = NULL;
+ djvu_text_page_sexpr (page, page->text_structure, start, end);
+ return page->bounding_box;
+}
+
+/**
+ * djvu_text_page_append_search:
+ * @page: #DjvuTextPage instance
+ * @p: tree to append
+ * @case_sensitive: do not ignore case
+ * @delimit: insert spaces because of higher (sentence/paragraph/...) break
+ *
+ * Appends the tree in @p to the internal text string.
+ */
+static void
+djvu_text_page_append_text (DjvuTextPage *page,
+ miniexp_t p,
+ gboolean case_sensitive,
+ gboolean delimit)
+{
+ char *token_text;
+
+ g_return_if_fail (miniexp_consp (p) &&
+ miniexp_symbolp (miniexp_car (p)));
+
+ delimit |= page->char_symbol != miniexp_car (p);
+
+ miniexp_t deeper = miniexp_cddr (miniexp_cdddr (p));
+ while (deeper != miniexp_nil) {
+ miniexp_t data = miniexp_car (deeper);
+ if (miniexp_stringp (data)) {
+ DjvuTextLink link;
+ link.position = page->text == NULL ? 0 :
+ strlen (page->text);
+ link.pair = p;
+ g_array_append_val (page->links, link);
+
+ token_text = (char *) miniexp_to_str (data);
+ if (!case_sensitive)
+ token_text = g_utf8_casefold (token_text, -1);
+ if (page->text == NULL)
+ page->text = g_strdup (token_text);
+ else {
+ char *new_text =
+ g_strjoin (delimit ? " " : NULL,
+ page->text, token_text,
+ NULL);
+ g_free (page->text);
+ page->text = new_text;
+ }
+ if (!case_sensitive)
+ g_free (token_text);
+ } else
+ djvu_text_page_append_text (page, data,
+ case_sensitive, delimit);
+ delimit = FALSE;
+ deeper = miniexp_cdr (deeper);
+ }
+}
+
+/**
+ * djvu_text_page_search:
+ * @page: #DjvuTextPage instance
+ * @text: text to search
+ *
+ * Searches the page for the given text. The results list has to be
+ * externally freed afterwards.
+ */
+void
+djvu_text_page_search (DjvuTextPage *page,
+ const char *text)
+{
+ char *haystack = page->text;
+ int search_len;
+ EvRectangle *result;
+ if (page->links->len == 0)
+ return;
+
+ search_len = strlen (text);
+ while ((haystack = strstr (haystack, text)) != NULL) {
+ int start_p = haystack - page->text;
+ miniexp_t start = djvu_text_page_position (page, start_p);
+ int end_p = start_p + search_len - 1;
+ miniexp_t end = djvu_text_page_position (page, end_p);
+ result = djvu_text_page_box (page, start, end);
+ g_assert (result);
+ page->results = g_list_prepend (page->results, result);
+ haystack = haystack + search_len;
+ }
+ page->results = g_list_reverse (page->results);
+}
+
+
+/**
+ * djvu_text_page_prepare_search:
+ * @page: #DjvuTextPage instance
+ * @case_sensitive: do not ignore case
+ *
+ * Indexes the page text and prepares the page for subsequent searches.
+ */
+void
+djvu_text_page_prepare_search (DjvuTextPage *page,
+ gboolean case_sensitive)
+{
+ djvu_text_page_append_text (page, page->text_structure,
+ case_sensitive, FALSE);
+}
+
+/**
+ * djvu_text_page_new:
+ * @text: S-expression of the page text
+ *
+ * Creates a new page to search.
+ *
+ * Returns: new #DjvuTextPage instance
+ */
+DjvuTextPage *
+djvu_text_page_new (miniexp_t text)
+{
+ DjvuTextPage *page;
+
+ page = g_new0 (DjvuTextPage, 1);
+ page->links = g_array_new (FALSE, FALSE, sizeof (DjvuTextLink));
+ page->char_symbol = miniexp_symbol ("char");
+ page->word_symbol = miniexp_symbol ("word");
+ page->text_structure = text;
+ return page;
+}
+
+/**
+ * djvu_text_page_free:
+ * @page: #DjvuTextPage instance
+ *
+ * Frees the given #DjvuTextPage instance.
+ */
+void
+djvu_text_page_free (DjvuTextPage *page)
+{
+ g_free (page->text);
+ g_array_free (page->links, TRUE);
+ g_free (page);
+}