@inproceedings{d5ab4219716d4f7cada4e65cc62d5e2a,
title = "Factor automata of automata and applications",
abstract = "An efficient data structure for representing the full index of a set of strings is the factor automaton, the minimal deterministic automaton representing the set of all factors or substrings of these strings. This paper presents a novel analysis of the size of the factor automaton of an automaton, that is the minimal deterministic automaton accepting the set of factors of a finite set of strings, itself represented by a finite automaton. It shows that the factor automaton of a set of strings U has at most 2|Q| - 2 states, where Q is the number of nodes of a prefix-tree representing the strings in U, a bound that significantly improves over 2||U|| - 1, the bound given by Blumer et al. (1987), where ||U|| is the sum of the lengths of all strings in U. It also gives novel and general bounds for the size of the factor automaton of an automaton as a function of the size of the original automaton and the maximal length of a suffix shared by the strings it accepts. Our analysis suggests that the use of factor automata of automata can be practical for large-scale applications, a fact that is further supported by the results of our experiments applying factor automata to a music identification task with more than 15,000 songs.",
keywords = "Factor automata, Finite automata, Information retrieval, Inverted files, Music identification, Suffix automata, Suffix trees, Text indexing",
author = "Mehryar Mohri and Pedro Moreno and Eugene Weinstein",
year = "2007",
doi = "10.1007/978-3-540-76336-9_17",
language = "English (US)",
isbn = "9783540763352",
series = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
publisher = "Springer Verlag",
pages = "168--179",
booktitle = "Implementation and Application of Automata - 12th International Conference, CIAA 2007, Revised Selected Papers",
note = "12th International Conference on Implementation and Application of Automata, CIAA 2007 ; Conference date: 16-07-2007 Through 18-07-2007",
}