Source: pdf_find_controller.js

/* Copyright 2012 Mozilla Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import { createPromiseCapability } from 'pdfjs-dist'
import { getCharacterType } from './pdf_find_utils.js'
import { scrollIntoView } from './ui_utils.js'

const FindState = {
  FOUND: 0,
  NOT_FOUND: 1,
  WRAPPED: 2,
  PENDING: 3,
}

const FIND_TIMEOUT = 250 // ms
const MATCH_SCROLL_OFFSET_TOP = -50 // px
const MATCH_SCROLL_OFFSET_LEFT = -400 // px

const CHARACTERS_TO_NORMALIZE = {
  '\u2018': "'", // Left single quotation mark
  '\u2019': "'", // Right single quotation mark
  '\u201A': "'", // Single low-9 quotation mark
  '\u201B': "'", // Single high-reversed-9 quotation mark
  '\u201C': '"', // Left double quotation mark
  '\u201D': '"', // Right double quotation mark
  '\u201E': '"', // Double low-9 quotation mark
  '\u201F': '"', // Double high-reversed-9 quotation mark
  '\u00BC': '1/4', // Vulgar fraction one quarter
  '\u00BD': '1/2', // Vulgar fraction one half
  '\u00BE': '3/4', // Vulgar fraction three quarters
}

let normalizationRegex = null
function normalize(text) {
  if (!normalizationRegex) {
    // Compile the regular expression for text normalization once.
    const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join('')
    normalizationRegex = new RegExp(`[${replace}]`, 'g')
  }
  return text.replace(normalizationRegex, function (ch) {
    return CHARACTERS_TO_NORMALIZE[ch]
  })
}

/**
 * @typedef {Object} PDFFindControllerOptions
 * @property {IPDFLinkService} linkService - The navigation/linking service.
 * @property {EventBus} eventBus - The application event bus.
 */

/**
 * Provides search functionality to find a given string in a PDF document.
 */
class PDFFindController {
  /**
   * @param {PDFFindControllerOptions} options
   */
  constructor({ linkService, eventBus }) {
    this._linkService = linkService
    this._eventBus = eventBus

    this._reset()
    eventBus._on('findbarclose', this._onFindBarClose.bind(this))
  }

  get highlightMatches() {
    return this._highlightMatches
  }

  get pageMatches() {
    return this._pageMatches
  }

  get pageMatchesLength() {
    return this._pageMatchesLength
  }

  get selected() {
    return this._selected
  }

  get state() {
    return this._state
  }

  /**
   * Set a reference to the PDF document in order to search it.
   * Note that searching is not possible if this method is not called.
   *
   * @param {PDFDocumentProxy} pdfDocument - The PDF document to search.
   */
  setDocument(pdfDocument) {
    if (this._pdfDocument) {
      this._reset()
    }
    if (!pdfDocument) {
      return
    }
    this._pdfDocument = pdfDocument
    this._firstPageCapability.resolve()
  }

  executeCommand(cmd, state) {
    if (!state) {
      return
    }
    const pdfDocument = this._pdfDocument

    if (this._state === null || this._shouldDirtyMatch(cmd, state)) {
      this._dirtyMatch = true
    }
    this._state = state
    if (cmd !== 'findhighlightallchange') {
      this._updateUIState(FindState.PENDING)
    }

    this._firstPageCapability.promise.then(() => {
      // If the document was closed before searching began, or if the search
      // operation was relevant for a previously opened document, do nothing.
      if (
        !this._pdfDocument ||
        (pdfDocument && this._pdfDocument !== pdfDocument)
      ) {
        return
      }
      this._extractText()

      const findbarClosed = !this._highlightMatches
      const pendingTimeout = !!this._findTimeout

      if (this._findTimeout) {
        clearTimeout(this._findTimeout)
        this._findTimeout = null
      }
      if (cmd === 'find') {
        // Trigger the find action with a small delay to avoid starting the
        // search when the user is still typing (saving resources).
        this._findTimeout = setTimeout(() => {
          this._nextMatch()
          this._findTimeout = null
        }, FIND_TIMEOUT)
      } else if (this._dirtyMatch) {
        // Immediately trigger searching for non-'find' operations, when the
        // current state needs to be reset and matches re-calculated.
        this._nextMatch()
      } else if (cmd === 'findagain') {
        this._nextMatch()

        // When the findbar was previously closed, and `highlightAll` is set,
        // ensure that the matches on all active pages are highlighted again.
        if (findbarClosed && this._state.highlightAll) {
          this._updateAllPages()
        }
      } else if (cmd === 'findhighlightallchange') {
        // If there was a pending search operation, synchronously trigger a new
        // search *first* to ensure that the correct matches are highlighted.
        if (pendingTimeout) {
          this._nextMatch()
        } else {
          this._highlightMatches = true
        }
        this._updateAllPages() // Update the highlighting on all active pages.
      } else {
        this._nextMatch()
      }
    })
  }

  scrollMatchIntoView({ element = null, pageIndex = -1, matchIndex = -1 }) {
    if (!this._scrollMatches || !element) {
      return
    } else if (matchIndex === -1 || matchIndex !== this._selected.matchIdx) {
      return
    } else if (pageIndex === -1 || pageIndex !== this._selected.pageIdx) {
      return
    }
    this._scrollMatches = false // Ensure that scrolling only happens once.

    const spot = {
      top: MATCH_SCROLL_OFFSET_TOP,
      left: MATCH_SCROLL_OFFSET_LEFT,
    }
    scrollIntoView(element, spot, /* skipOverflowHiddenElements = */ true)
  }

  _reset() {
    this._highlightMatches = false
    this._scrollMatches = false
    this._pdfDocument = null
    this._pageMatches = []
    this._pageMatchesLength = []
    this._state = null
    // Currently selected match.
    this._selected = {
      pageIdx: -1,
      matchIdx: -1,
    }
    // Where the find algorithm currently is in the document.
    this._offset = {
      pageIdx: null,
      matchIdx: null,
      wrapped: false,
    }
    this._extractTextPromises = []
    this._pageContents = [] // Stores the normalized text for each page.
    this._matchesCountTotal = 0
    this._pagesToSearch = null
    this._pendingFindMatches = Object.create(null)
    this._resumePageIdx = null
    this._dirtyMatch = false
    clearTimeout(this._findTimeout)
    this._findTimeout = null

    this._firstPageCapability = createPromiseCapability()
  }

  /**
   * @type {string} The (current) normalized search query.
   */
  get _query() {
    if (this._state.query !== this._rawQuery) {
      this._rawQuery = this._state.query
      this._normalizedQuery = normalize(this._state.query)
    }
    return this._normalizedQuery
  }

  _shouldDirtyMatch(cmd, state) {
    // When the search query changes, regardless of the actual search command
    // used, always re-calculate matches to avoid errors (fixes bug 1030622).
    if (state.query !== this._state.query) {
      return true
    }
    switch (cmd) {
      case 'findagain':
        const pageNumber = this._selected.pageIdx + 1
        const linkService = this._linkService
        // Only treat a 'findagain' event as a new search operation when it's
        // *absolutely* certain that the currently selected match is no longer
        // visible, e.g. as a result of the user scrolling in the document.
        //
        // NOTE: If only a simple `this._linkService.page` check was used here,
        // there's a risk that consecutive 'findagain' operations could "skip"
        // over matches at the top/bottom of pages thus making them completely
        // inaccessible when there's multiple pages visible in the viewer.
        if (
          pageNumber >= 1 &&
          pageNumber <= linkService.pagesCount &&
          pageNumber !== linkService.page &&
          !linkService.isPageVisible(pageNumber)
        ) {
          return true
        }
        return false
      case 'findhighlightallchange':
        return false
    }
    return true
  }

  /**
   * Helper for multi-term search that fills the `matchesWithLength` array
   * and handles cases where one search term includes another search term (for
   * example, "tamed tame" or "this is"). It looks for intersecting terms in
   * the `matches` and keeps elements with a longer match length.
   */
  _prepareMatches(matchesWithLength, matches, matchesLength) {
    function isSubTerm(currentIndex) {
      const currentElem = matchesWithLength[currentIndex]
      const nextElem = matchesWithLength[currentIndex + 1]

      // Check for cases like "TAMEd TAME".
      if (
        currentIndex < matchesWithLength.length - 1 &&
        currentElem.match === nextElem.match
      ) {
        currentElem.skipped = true
        return true
      }

      // Check for cases like "thIS IS".
      for (let i = currentIndex - 1; i >= 0; i--) {
        const prevElem = matchesWithLength[i]
        if (prevElem.skipped) {
          continue
        }
        if (prevElem.match + prevElem.matchLength < currentElem.match) {
          break
        }
        if (
          prevElem.match + prevElem.matchLength >=
          currentElem.match + currentElem.matchLength
        ) {
          currentElem.skipped = true
          return true
        }
      }
      return false
    }

    // Sort the array of `{ match: <match>, matchLength: <matchLength> }`
    // objects on increasing index first and on the length otherwise.
    matchesWithLength.sort(function (a, b) {
      return a.match === b.match
        ? a.matchLength - b.matchLength
        : a.match - b.match
    })
    for (let i = 0, len = matchesWithLength.length; i < len; i++) {
      if (isSubTerm(i)) {
        continue
      }
      matches.push(matchesWithLength[i].match)
      matchesLength.push(matchesWithLength[i].matchLength)
    }
  }

  /**
   * Determine if the search query constitutes a "whole word", by comparing the
   * first/last character type with the preceding/following character type.
   */
  _isEntireWord(content, startIdx, length) {
    if (startIdx > 0) {
      const first = content.charCodeAt(startIdx)
      const limit = content.charCodeAt(startIdx - 1)
      if (getCharacterType(first) === getCharacterType(limit)) {
        return false
      }
    }
    const endIdx = startIdx + length - 1
    if (endIdx < content.length - 1) {
      const last = content.charCodeAt(endIdx)
      const limit = content.charCodeAt(endIdx + 1)
      if (getCharacterType(last) === getCharacterType(limit)) {
        return false
      }
    }
    return true
  }

  _calculatePhraseMatch(query, pageIndex, pageContent, entireWord) {
    const matches = []
    const queryLen = query.length

    let matchIdx = -queryLen
    while (true) {
      matchIdx = pageContent.indexOf(query, matchIdx + queryLen)
      if (matchIdx === -1) {
        break
      }
      if (entireWord && !this._isEntireWord(pageContent, matchIdx, queryLen)) {
        continue
      }
      matches.push(matchIdx)
    }
    this._pageMatches[pageIndex] = matches
  }

  _calculateWordMatch(query, pageIndex, pageContent, entireWord) {
    const matchesWithLength = []

    // Divide the query into pieces and search for text in each piece.
    const queryArray = query.match(/\S+/g)
    for (let i = 0, len = queryArray.length; i < len; i++) {
      const subquery = queryArray[i]
      const subqueryLen = subquery.length

      let matchIdx = -subqueryLen
      while (true) {
        matchIdx = pageContent.indexOf(subquery, matchIdx + subqueryLen)
        if (matchIdx === -1) {
          break
        }
        if (
          entireWord &&
          !this._isEntireWord(pageContent, matchIdx, subqueryLen)
        ) {
          continue
        }
        // Other searches do not, so we store the length.
        matchesWithLength.push({
          match: matchIdx,
          matchLength: subqueryLen,
          skipped: false,
        })
      }
    }

    // Prepare arrays for storing the matches.
    this._pageMatchesLength[pageIndex] = []
    this._pageMatches[pageIndex] = []

    // Sort `matchesWithLength`, remove intersecting terms and put the result
    // into the two arrays.
    this._prepareMatches(
      matchesWithLength,
      this._pageMatches[pageIndex],
      this._pageMatchesLength[pageIndex]
    )
  }

  _calculateMatch(pageIndex) {
    let pageContent = this._pageContents[pageIndex]
    let query = this._query
    const { caseSensitive, entireWord, phraseSearch } = this._state

    if (query.length === 0) {
      // Do nothing: the matches should be wiped out already.
      return
    }

    if (!caseSensitive) {
      pageContent = pageContent.toLowerCase()
      query = query.toLowerCase()
    }

    if (phraseSearch) {
      this._calculatePhraseMatch(query, pageIndex, pageContent, entireWord)
    } else {
      this._calculateWordMatch(query, pageIndex, pageContent, entireWord)
    }

    // When `highlightAll` is set, ensure that the matches on previously
    // rendered (and still active) pages are correctly highlighted.
    if (this._state.highlightAll) {
      this._updatePage(pageIndex)
    }
    if (this._resumePageIdx === pageIndex) {
      this._resumePageIdx = null
      this._nextPageMatch()
    }

    // Update the match count.
    const pageMatchesCount = this._pageMatches[pageIndex].length
    if (pageMatchesCount > 0) {
      this._matchesCountTotal += pageMatchesCount
      this._updateUIResultsCount()
    }
  }

  _extractText() {
    // Perform text extraction once if this method is called multiple times.
    if (this._extractTextPromises.length > 0) {
      return
    }

    let promise = Promise.resolve()
    for (let i = 0, ii = this._linkService.pagesCount; i < ii; i++) {
      const extractTextCapability = createPromiseCapability()
      this._extractTextPromises[i] = extractTextCapability.promise

      promise = promise.then(() => {
        return this._pdfDocument
          .getPage(i + 1)
          .then((pdfPage) => {
            return pdfPage.getTextContent({
              normalizeWhitespace: true,
            })
          })
          .then(
            (textContent) => {
              const textItems = textContent.items
              const strBuf = []

              for (let j = 0, jj = textItems.length; j < jj; j++) {
                strBuf.push(textItems[j].str)
              }

              // Store the normalized page content (text items) as one string.
              this._pageContents[i] = normalize(strBuf.join(''))
              extractTextCapability.resolve(i)
            },
            (reason) => {
              console.error(
                `Unable to get text content for page ${i + 1}`,
                reason
              )
              // Page error -- assuming no text content.
              this._pageContents[i] = ''
              extractTextCapability.resolve(i)
            }
          )
      })
    }
  }

  _updatePage(index) {
    if (this._scrollMatches && this._selected.pageIdx === index) {
      // If the page is selected, scroll the page into view, which triggers
      // rendering the page, which adds the text layer. Once the text layer
      // is built, it will attempt to scroll the selected match into view.
      this._linkService.page = index + 1
    }

    this._eventBus.dispatch('updatetextlayermatches', {
      source: this,
      pageIndex: index,
    })
  }

  _updateAllPages() {
    this._eventBus.dispatch('updatetextlayermatches', {
      source: this,
      pageIndex: -1,
    })
  }

  _nextMatch() {
    const previous = this._state.findPrevious
    const currentPageIndex = this._linkService.page - 1
    const numPages = this._linkService.pagesCount

    this._highlightMatches = true

    if (this._dirtyMatch) {
      // Need to recalculate the matches, reset everything.
      this._dirtyMatch = false
      this._selected.pageIdx = this._selected.matchIdx = -1
      this._offset.pageIdx = currentPageIndex
      this._offset.matchIdx = null
      this._offset.wrapped = false
      this._resumePageIdx = null
      this._pageMatches.length = 0
      this._pageMatchesLength.length = 0
      this._matchesCountTotal = 0

      this._updateAllPages() // Wipe out any previously highlighted matches.

      for (let i = 0; i < numPages; i++) {
        // Start finding the matches as soon as the text is extracted.
        if (this._pendingFindMatches[i] === true) {
          continue
        }
        this._pendingFindMatches[i] = true
        this._extractTextPromises[i].then((pageIdx) => {
          delete this._pendingFindMatches[pageIdx]
          this._calculateMatch(pageIdx)
        })
      }
    }

    // If there's no query there's no point in searching.
    if (this._query === '') {
      this._updateUIState(FindState.FOUND)
      return
    }
    // If we're waiting on a page, we return since we can't do anything else.
    if (this._resumePageIdx) {
      return
    }

    const offset = this._offset
    // Keep track of how many pages we should maximally iterate through.
    this._pagesToSearch = numPages
    // If there's already a `matchIdx` that means we are iterating through a
    // page's matches.
    if (offset.matchIdx !== null) {
      const numPageMatches = this._pageMatches[offset.pageIdx].length
      if (
        (!previous && offset.matchIdx + 1 < numPageMatches) ||
        (previous && offset.matchIdx > 0)
      ) {
        // The simple case; we just have advance the matchIdx to select
        // the next match on the page.
        offset.matchIdx = previous ? offset.matchIdx - 1 : offset.matchIdx + 1
        this._updateMatch(/* found = */ true)
        return
      }
      // We went beyond the current page's matches, so we advance to
      // the next page.
      this._advanceOffsetPage(previous)
    }
    // Start searching through the page.
    this._nextPageMatch()
  }

  _matchesReady(matches) {
    const offset = this._offset
    const numMatches = matches.length
    const previous = this._state.findPrevious

    if (numMatches) {
      // There were matches for the page, so initialize `matchIdx`.
      offset.matchIdx = previous ? numMatches - 1 : 0
      this._updateMatch(/* found = */ true)
      return true
    }
    // No matches, so attempt to search the next page.
    this._advanceOffsetPage(previous)
    if (offset.wrapped) {
      offset.matchIdx = null
      if (this._pagesToSearch < 0) {
        // No point in wrapping again, there were no matches.
        this._updateMatch(/* found = */ false)
        // While matches were not found, searching for a page
        // with matches should nevertheless halt.
        return true
      }
    }
    // Matches were not found (and searching is not done).
    return false
  }

  _nextPageMatch() {
    if (this._resumePageIdx !== null) {
      console.error('There can only be one pending page.')
    }

    let matches = null
    do {
      const pageIdx = this._offset.pageIdx
      matches = this._pageMatches[pageIdx]
      if (!matches) {
        // The matches don't exist yet for processing by `_matchesReady`,
        // so set a resume point for when they do exist.
        this._resumePageIdx = pageIdx
        break
      }
    } while (!this._matchesReady(matches))
  }

  _advanceOffsetPage(previous) {
    const offset = this._offset
    const numPages = this._linkService.pagesCount
    offset.pageIdx = previous ? offset.pageIdx - 1 : offset.pageIdx + 1
    offset.matchIdx = null

    this._pagesToSearch--

    if (offset.pageIdx >= numPages || offset.pageIdx < 0) {
      offset.pageIdx = previous ? numPages - 1 : 0
      offset.wrapped = true
    }
  }

  _updateMatch(found = false) {
    let state = FindState.NOT_FOUND
    const wrapped = this._offset.wrapped
    this._offset.wrapped = false

    if (found) {
      const previousPage = this._selected.pageIdx
      this._selected.pageIdx = this._offset.pageIdx
      this._selected.matchIdx = this._offset.matchIdx
      state = wrapped ? FindState.WRAPPED : FindState.FOUND

      // Update the currently selected page to wipe out any selected matches.
      if (previousPage !== -1 && previousPage !== this._selected.pageIdx) {
        this._updatePage(previousPage)
      }
    }

    this._updateUIState(state, this._state.findPrevious)
    if (this._selected.pageIdx !== -1) {
      // Ensure that the match will be scrolled into view.
      this._scrollMatches = true

      this._updatePage(this._selected.pageIdx)
    }
  }

  _onFindBarClose(evt) {
    const pdfDocument = this._pdfDocument
    // Since searching is asynchronous, ensure that the removal of highlighted
    // matches (from the UI) is async too such that the 'updatetextlayermatches'
    // events will always be dispatched in the expected order.
    this._firstPageCapability.promise.then(() => {
      // Only update the UI if the document is open, and is the current one.
      if (
        !this._pdfDocument ||
        (pdfDocument && this._pdfDocument !== pdfDocument)
      ) {
        return
      }
      // Ensure that a pending, not yet started, search operation is aborted.
      if (this._findTimeout) {
        clearTimeout(this._findTimeout)
        this._findTimeout = null
      }
      // Abort any long running searches, to avoid a match being scrolled into
      // view *after* the findbar has been closed. In this case `this._offset`
      // will most likely differ from `this._selected`, hence we also ensure
      // that any new search operation will always start with a clean slate.
      if (this._resumePageIdx) {
        this._resumePageIdx = null
        this._dirtyMatch = true
      }
      // Avoid the UI being in a pending state when the findbar is re-opened.
      this._updateUIState(FindState.FOUND)

      this._highlightMatches = false
      this._updateAllPages() // Wipe out any previously highlighted matches.
    })
  }

  _requestMatchesCount() {
    const { pageIdx, matchIdx } = this._selected
    let current = 0,
      total = this._matchesCountTotal
    if (matchIdx !== -1) {
      for (let i = 0; i < pageIdx; i++) {
        current += (this._pageMatches[i] && this._pageMatches[i].length) || 0
      }
      current += matchIdx + 1
    }
    // When searching starts, this method may be called before the `pageMatches`
    // have been counted (in `_calculateMatch`). Ensure that the UI won't show
    // temporarily broken state when the active find result doesn't make sense.
    if (current < 1 || current > total) {
      current = total = 0
    }
    return { current, total }
  }

  _updateUIResultsCount() {
    this._eventBus.dispatch('updatefindmatchescount', {
      source: this,
      matchesCount: this._requestMatchesCount(),
    })
  }

  _updateUIState(state, previous) {
    this._eventBus.dispatch('updatefindcontrolstate', {
      source: this,
      state,
      previous,
      matchesCount: this._requestMatchesCount(),
      rawQuery: this._state ? this._state.query : null,
    })
  }
}

export { FindState, PDFFindController }