%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%                                                                      %
%  The Complete Bibliography of the ``Information Retrieval'' journal  %
%                    Kluwer Academic Publishers                        %
%                                                                      %
%                    compiled and maintained by                        %
%                                                                      %
%                      Fabrizio Sebastiani                             %
%             Dipartimento di Matematica Pura e Applicata              %
%                     Universita' di Padova                            %
%       Via Giovanni Battista Belzoni, 7 - 35131 Padova, Italy         %
%               http://www.math.unipd.it/~fabseb60/                    %
%                                                                      %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%                               %
%   Volume 1, Issue 1/2, 1999   %
%                               %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


@article{Kantor99,
   author       = {Kantor, Paul and Robertson, Steve},
   title        = {Editorial},
   journal      = {Information Retrieval},
   volume       = {1},
   number       = {1/2},
   pages        = {5},
   year         = {1999},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Banks99,
   author       = {Banks, David and Over, Paul and Zhang, Nien-Fan},
   title        = {Blind Men and Elephants: Six Approaches to {TREC} Data},
   journal      = {Information Retrieval},
   volume       = {1},
   number       = {1/2},
   pages        = {7--34},
   year         = {1999},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The paper reviews six recent efforts to better understand
                   performance measurements on information retrieval (IR)
                   systems within the framework of the Text REtrieval
                   Conferences (TREC): analysis of variance, cluster
                   analyses, rank correlations, beadplots, multidimensional
                   scaling, and item response analysis. None of this work has
                   yielded any substantial new insights. Prospects that
                   additional work along these lines will yield more
                   interesting results vary but are in general not promising.
                   Some suggestions are made for paying greater attention to
                   richer descriptions of IR system behavior but within
                   smaller, better controlled settings.},
}
@article{Mani99,
   author       = {Mani, Inderjeet and Bloedorn, Eric},
   title        = {Summarizing Similarities and Differences Among Related
                   Documents},
   journal      = {Information Retrieval},
   volume       = {1},
   number       = {1/2},
   pages        = {35--67},
   year         = {1999},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {In many modern information retrieval applications, a
                   common problem which arises is the existence of multiple
                   documents covering similar information, as in the case of
                   multiple news stories about an event or a sequence of
                   events. A particular challenge for text summarization is
                   to be able to summarize the similarities and differences
                   in information content among these documents. The approach
                   described here exploits the results of recent progress in
                   information extraction to represent salient units of text
                   and their relationships. By exploiting meaningful
                   relations between units based on an analysis of text
                   cohesion and the context in which the comparison is
                   desired, the summarizer can pinpoint similarities and
                   differences, and align text segments. In evaluation
                   experiments, these techniques for exploiting cohesion
                   relations result in summaries which (i) help users more
                   quickly complete a retrieval task (ii) result in improved
                   alignment accuracy over baselines, and (iii) improve
                   identification of topic-relevant similarities and
                   differences.},
}
@article{Yang99,
   author       = {Yang, Yiming},
   title        = {An Evaluation of Statistical Approaches to Text
                   Categorization},
   journal      = {Information Retrieval},
   volume       = {1},
   number       = {1/2},
   pages        = {69--90},
   year         = {1999},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {This paper focuses on a comparative evaluation of a
                   wide-range of text categorization methods, including
                   previously published results on the Reuters corpus and new
                   results of additional experiments. A controlled study
                   using three classifiers, kNN, LLSF and WORD, was conducted
                   to examine the impact of configuration variations in five
                   versions of Reuters on the observed performance of
                   classifiers. Analysis and empirical evidence suggest that
                   the evaluation results on some versions of Reuters were
                   significantly affected by the inclusion of a large portion
                   of unlabelled documents, making those results difficult to
                   interpret and leading to considerable confusions in the
                   literature. Using the results evaluated on the other
                   versions of Reuters which exclude the unlabelled
                   documents, the performance of twelve methods are compared
                   directly or indirectly. For indirect comparisons, kNN,
                   LLSF and WORD were used as baselines, since they were
                   evaluated on all versions of Reuters that exclude the
                   unlabelled documents. As a global observation, kNN, LLSF
                   and a neural network method had the best performance;
                   except for a Naive Bayes approach, the other learning
                   algorithms also performed relatively well.},
}
@article{Melucci99,
   author       = {Melucci, Massimo},
   title        = {An Evaluation of Automatically Constructed Hypertexts for
                   Information Retrieval},
   journal      = {Information Retrieval},
   volume       = {1},
   number       = {1/2},
   pages        = {91--114},
   year         = {1999},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {This paper assesses the retrieval effectiveness of
                   automatically constructed inter-document hypertext links
                   in Information Retrieval (IR). The objectives of the
                   experiments described are to obtain evidence concerning
                   the usefulness of querying and browsing automatically
                   constructed IR hypertexts. Links are built by using IR
                   techniques, as these enable rapid, automatic production of
                   hypertexts from a document collection for accessing the
                   collection itself. These tests are carried out in a
                   laboratory environment and through simulation of link
                   browsing. Results of experiments show that browsing has
                   little impact on the retrieval of relevant documents if
                   used in place of querying or relevance feedback methods,
                   though may be practical if used in combination with
                   them.},
}
@article{Hawking99,
   author       = {Hawking, David and Thistlewaite, Paul and Harman, Donna},
   title        = {Scaling Up the {TREC} Collection},
   journal      = {Information Retrieval},
   volume       = {1},
   number       = {1/2},
   pages        = {115--137},
   year         = {1999},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Due to the popularity of Web search engines, a large
                   proportion of real text retrieval queries are now
                   processed over collections measured in tens or hundreds of
                   gigabytes. A new Very Large test Collection (VLC) has been
                   created to support qualification, measurement and
                   comparison of systems operating at this level and to
                   permit the study of the properties of very large
                   collections. The VLC is an extension of the well-known
                   TREC collection and has been distributed under the same
                   conditions. A simple set of efficiency and effectiveness
                   measures have been defined to encourage comparability of
                   reporting. The 20 gigabyte first-edition of the VLC and a
                   representative 10\% sample have been used in a special
                   interest track of the 1997 Text Retrieval Conference
                   (TREC-6). The unaffordable cost of obtaining complete
                   relevance assessments over collections of this scale is
                   avoided by concentrating on early precision and relying on
                   the core TREC collection to support detailed effectiveness
                   studies. Results obtained by TREC-6 VLC track participants
                   are presented here. All groups observed a significant
                   increase in early precision as collection size increased.
                   Explanatory hypotheses are advanced for future empirical
                   testing. A 100 gigabyte second edition VLC (VLC2) has
                   recently been compiled and distributed for use in TREC-7
                   in 1998.},
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%                               %
%    Volume 1, Issue 3, 1999    %
%                               %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

@article{Vogt99,
   author       = {Vogt, Christopher C. and Cottrell, Garrison W.},
   title        = {Fusion Via a Linear Combination of Scores},
   journal      = {Information Retrieval},
   volume       = {1},
   number       = {3},
   pages        = {151--173},
   year         = {1999},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {We present a thorough analysis of the capabilities of the
                   linear combination (LC) model for fusion of information
                   retrieval systems. The LC model combines the results lists
                   of multiple IR systems by scoring each document using a
                   weighted sum of the scores from each of the component
                   systems. We first present both empirical and analytical
                   justification for the hypotheses that such a model should
                   only be used when the systems involved have high
                   performance, a large overlap of relevant documents, and a
                   small overlap of nonrelevant documents. The empirical
                   approach allows us to very accurately predict the
                   performance of a combined system. We also derive a formula
                   for a theoretically optimal weighting scheme for combining
                   2 systems. We introduce d---the difference between the
                   average score on relevant documents and the average score
                   on nonrelevant documents---as a performance measure which
                   not only allows mathematical reasoning about system
                   performance, but also allows the selection of weights
                   which generalize well to new documents. We describe a
                   number of experiments involving large numbers of different
                   IR systems which support these findings.},
}
@article{Boughanem99,
   author       = {Boughanem, M. and Chrisment, C. and Tamine, L.},
   title        = {Genetic Approach to Query Space Exploration},
   journal      = {Information Retrieval},
   volume       = {1},
   number       = {3},
   pages        = {175--192},
   year         = {1999},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {This paper describes a genetic algorithm approach for
                   intelligent information retrieval. The goal is to find an
                   optimal set of documents which best matches the user's
                   needs by exploring and exploiting the document space. More
                   precisely, we define a specific genetic algorithm for
                   information retrieval based on knowledge based operators
                   and guided by a heuristic for relevance multi-modality
                   problem solving. Experiments with TREC-6 French data and
                   queries show the effectiveness of our approach.},
}
@article{Weigend99,
   author       = {Weigend, Andreas S. and Wiener, Erik D. and
                   Pedersen, Jan O.},
   title        = {Exploiting Hierarchy in Text Categorization},
   journal      = {Information Retrieval},
   volume       = {1},
   number       = {3},
   pages        = {193--216},
   year         = {1999},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {With the recent dramatic increase in electronic access to
                   documents, text categorization---the task of assigning
                   topics to a given document---has moved to the center of
                   the information sciences and knowledge management. This
                   article uses the structure that is present in the semantic
                   space of topics in order to improve performance in text
                   categorization: according to their meaning, topics can be
                   grouped together into ``meta-topics'', e.g., gold, silver,
                   and copper are all metals. The proposed architecture
                   matches the hierarchical structure of the topic space, as
                   opposed to a flat model that ignores the structure. It
                   accommodates both single and multiple topic assignments
                   for each document. Its probabilistic interpretation allows
                   its predictions to be combined in a principled way with
                   information from other sources. The first level of the
                   architecture predicts the probabilities of the meta-topic
                   groups. This allows the individual models for each topic
                   on the second level to focus on finer discriminations
                   within the group. Evaluating the performance of a
                   two-level implementation on the Reuters-22173 testbed of
                   newswire articles shows the most significant improvement
                   for rare classes.},
}
@article{Pirkola99,
   author       = {Pirkola, Ari and Keskustalo, Heikki and
                   J{\"{a}}rvelin, Kalervo},
   title        = {The Effects of Conjunction, Facet Structure, and
                   Dictionary Combinations in Concept-Based Cross-Language
                   Retrieval},
   journal      = {Information Retrieval},
   volume       = {1},
   number       = {3},
   pages        = {217--250},
   year         = {1999},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The paper studies concept-based cross-language
                   information retrieval (CLIR). The document collection was
                   a subset of the TREC collection. The test requests were
                   formed from TREC's health related topics. As translation
                   dictionaries the study used a general dictionary and a
                   domain-specific (=medical) dictionary. The effects of
                   translation method, conjunction, and facet order on the
                   effectiveness of concept-based cross-language queries were
                   studied, and concept-based structuring of cross-language
                   queries was compared to mechanical structuring based on
                   the output of dictionaries. The performance of translated
                   Finnish queries against English documents was compared to
                   the performance of original English queries against the
                   English documents, and the performance of different CLIR
                   query types was compared with one another. No major
                   difference was found between concept-based and mechanical
                   structuring. The best translation method was a
                   simultaneous look-up in the medical dictionary and the
                   general dictionary, in which case cross-language queries
                   performed as well as the original English queries. The
                   results showed that especially at high exhaustivity (the
                   number of mutually restrictive concepts in a request)
                   levels cross-language queries perform well in relation to
                   monolingual queries. This suggests that conjunction
                   disambiguates cross-language queries. An extensive study
                   was made of the relative importance of the concepts of
                   requests. On the basis of the classification data of
                   request concepts it was shown how the order of facets in a
                   query affects cross-language as well as monolingual
                   queries.},
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%                               %
%    Volume 1, Issue 4, 2000    %
%                               %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

@article{Markkula00,
   author       = {Markkula, Marjo and Sormunen, Eero},
   title        = {End-User Searching Challenges Indexing Practices in the
                   Digital Newspaper Photo Archive},
   journal      = {Information Retrieval},
   volume       = {1},
   number       = {4},
   pages        = {259--285},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Previous research in conceptual indexing methods of
                   images has furnished us with refined theoretical
                   frameworks characterising various aspects of images that
                   could and should be indexed using textual descriptors. The
                   development of digital image processing technologies has
                   bred a brigade of content-based indexing and retrieval
                   methods available for applications. What the users need
                   and in what kinds of environments different indexing and
                   retrieval methods are relevant, has remained an area of
                   less intensive research work. This article presents the
                   results of a field study concentrating on journalists as
                   users of a digital newspaper photo archive. The expressed
                   photo needs, applied selection criteria and observed
                   searching behaviours in journalists' daily work were
                   contrasted with the indexing practices applied by the
                   archivists. The results showed that the journalists
                   achieved satisfactory results when trivial query terms
                   were available, e.g. when photos of named persons were
                   needed. Browsing was the main searching strategy applied
                   by the journalists, but the system did not support
                   browsing well. The access problems faced by the users in
                   particular photo needs are discussed in detail. The paper
                   concludes by discussing the potential approaches in
                   developing both the concept-based and content-based
                   indexing methods as well as the user interfaces in photo
                   retrieval systems.},
}
@article{Frost00,
   author       = {Frost, C. Olivia and Taylor, Bradley and Noakes, Anna and
                   Markel, Stephen and Torres, Deborah and
                   Drabenstott, Karen M.},
   title        = {Browse and Search Patterns in a Digital Image Database},
   journal      = {Information Retrieval},
   volume       = {1},
   number       = {4},
   pages        = {287--313},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {A prototype image retrieval system with browse and search
                   capabilities was developed to investigate patterns of
                   searching a collection of digital visual images, as well
                   as factors, such as image size, resolution, and download
                   speed, which affect browsing. The subject populations were
                   art history specialists and non-specialists. Through focus
                   group interviews, a controlled test, post-test interviews
                   and an online survey, data was gathered to compare
                   preferences and actual patterns of use in browsing and
                   searching. While specialists preferred direct search to
                   browsing, and generalists used browsing as their preferred
                   mode, both user groups found each mode to play a role
                   depending on information need, and found value in a system
                   combining both browse and direct search. There were no
                   significant differences in performance among the search
                   modes of browse, search, and combined browse/search models
                   when the quasi-controlled study tested the different
                   modes.},
}
% Initials are space-separated ("S. T.") so that BibTeX sees each one as its
% own name token; a glued "S.T." is a single token and abbreviating styles
% would keep only the first initial.
@article{Bookstein00,
   author       = {Bookstein, A. and Klein, S. T. and Raita, T.},
   title        = {Simple {Bayesian} Model for Bitmap Compression},
   journal      = {Information Retrieval},
   volume       = {1},
   number       = {4},
   pages        = {315--328},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Bitmaps are a useful, but storage voracious, component of
                   many information retrieval systems. Earlier efforts to
                   compress bitmaps were based on models of bit generation,
                   particularly Markov models. While these permitted
                   considerable reduction in storage, the short memory of
                   Markov models may limit their compression efficiency. In
                   this paper we accept the state orientation of Markov
                   models, but introduce a Bayesian approach to assess the
                   state; the analysis is based on data accumulating in a
                   growing window. The paper describes the details of the
                   probabilistic assumptions governing the Bayesian analysis,
                   as well as the protocol for controlling the window that
                   receives the data. We find slight improvement over the
                   best performing strictly Markov models.},
}
@article{Kekalainen00,
   author       = {Kek{\"{a}}l{\"{a}}inen, Jaana and J{\"{a}}rvelin, Kalervo},
   title        = {The Co-Effects of Query Structure and Expansion on
                   Retrieval Performance in Probabilistic Text Retrieval},
   journal      = {Information Retrieval},
   volume       = {1},
   number       = {4},
   pages        = {329--344},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The effects of query structures and query expansion (QE)
                   on retrieval performance were tested with a best match
                   retrieval system (InQuery). Query structure means the use
                   of operators to express the relations between search keys.
                   Six different structures were tested, representing strong
                   structures (e.g., queries with facets or concepts
                   identified) and weak structures (no concepts identified, a
                   query is `a bag of search keys'). QE was based on
                   concepts, which were first selected from a searching
                   thesaurus, and then expanded by semantic relationships
                   given in the thesaurus. The expansion levels were (a) no
                   expansion, (b) a synonym expansion, (c) a narrower concept
                   expansion, (d) an associative concept expansion, and (e) a
                   cumulative expansion of all other expansions. With weak
                   structures and Boolean structured queries, QE was not very
                   effective. The best performance was achieved with a
                   combination of a facet structure, where search keys within
                   a facet were treated as instances of one search key (the
                   SYN operator), and the largest expansion.},
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%                               %
%    Volume 2, Issue 1, 2000    %
%                               %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

@article{Dunlop00,
   author       = {Dunlop, Mark and Lalmas, Mounia},
   title        = {Introduction},
   journal      = {Information Retrieval},
   volume       = {2},
   number       = {1},
   pages        = {9--15},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
% The surname is "van Rijsbergen". The comma form with a lowercase particle
% makes BibTeX parse "van" as the von part instead of as a given name, and
% the initials are space-separated so that each is its own name token.
@article{Rijsbergen00,
   author       = {van Rijsbergen, C. J.},
   title        = {Another Look at the Logical Uncertainty Principle},
   journal      = {Information Retrieval},
   volume       = {2},
   number       = {1},
   pages        = {17--26},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The Logical Uncertainty Principle is re-examined from the
                   point of classical logic. Two interpretations are given,
                   an objective one in terms of an axiomatic theory of
                   information, and a subjective one based on Ramsey's theory
                   of probability.},
}
@article{Crestani00,
   author       = {Crestani, Fabio},
   title        = {Exploiting the Similarity of Non-Matching Terms at
                   Retrieval Time},
   journal      = {Information Retrieval},
   volume       = {2},
   number       = {1},
   pages        = {27--47},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {In classic Information Retrieval systems a relevant
                   document will not be retrieved in response to a query if
                   the document and query representations do not share at
                   least one term. This problem, known as ``term mismatch'',
                   has been recognised for a long time by the Information
                   Retrieval community and a number of possible solutions
                   have been proposed. Here I present a preliminary
                   investigation into a new class of retrieval models that
                   attempt to solve the term mismatch problem by exploiting
                   complete or partial knowledge of term similarity in the
                   term space. The use of term similarity enables to enhance
                   classic retrieval models by taking into account
                   non-matching terms. The theoretical advantages and
                   drawbacks of these models are presented and compared with
                   other models tackling the same problem. A preliminary
                   experimental investigation into the performance gain
                   achieved by exploiting term similarity with the proposed
                   models is presented and discussed.},
}
@article{Sanderson00,
   author       = {Sanderson, Mark},
   title        = {Retrieving with Good Sense},
   journal      = {Information Retrieval},
   volume       = {2},
   number       = {1},
   pages        = {49--69},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Although always present in text, word sense ambiguity
                   only recently became regarded as a problem to information
                   retrieval which was potentially solvable. The growth of
                   interest in word senses resulted from new directions taken
                   in disambiguation research. This paper first outlines this
                   research and surveys the resulting efforts in information
                   retrieval. Although the majority of attempts to improve
                   retrieval effectiveness were unsuccessful, much was learnt
                   from the research. Most notably a notion of under what
                   circumstance disambiguation may prove of use to
                   retrieval.},
}
@article{Adriani00,
   author       = {Adriani, Mirna},
   title        = {Using Statistical Term Similarity for Sense
                   Disambiguation in Cross-Language Information Retrieval},
   journal      = {Information Retrieval},
   volume       = {2},
   number       = {1},
   pages        = {71--82},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {With the increasing availability of machine-readable
                   bilingual dictionaries, dictionary-based automatic query
                   translation has become a viable approach to Cross-Language
                   Information Retrieval (CLIR). In this approach, resolving
                   term ambiguity is a crucial step. We propose a sense
                   disambiguation technique based on a term-similarity
                   measure for selecting the right translation sense of a
                   query term. In addition, we apply a query expansion
                   technique which is also based on the term similarity
                   measure to improve the effectiveness of the translation
                   queries. The results of our Indonesian to English and
                   English to Indonesian CLIR experiments demonstrate the
                   effectiveness of the sense disambiguation technique. As
                   for the query expansion technique, it is shown to be
                   effective as long as the term ambiguity in the queries has
                   been resolved. In the effort to solve the term ambiguity
                   problem, we discovered that differences in the pattern of
                   word-formation between the two languages render query
                   translations from one language to the other difficult.},
}
@article{Ruthven00,
   author       = {Ruthven, Ian},
   title        = {Incorporating Aspects of Information Use into Relevance
                   Feedback},
   journal      = {Information Retrieval},
   volume       = {2},
   number       = {1},
   pages        = {83--88},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {In this paper we look at some of the problems in
                   interacting with best-match retrieval systems. In
                   particular, we examine the areas of interaction, some
                   investigations of the complexity and breadth of
                   interaction and attempts to categorise user's information
                   seeking behaviour. We suggest that one of the difficulties
                   of traditional IR systems in supporting information
                   seeking is the way the information content of documents is
                   represented. We discuss an alternative representation,
                   based on how information is used within documents.},
}
@article{Campbell00,
   author       = {Iain Campbell},
   title        = {Interactive Evaluation of the Ostensive Model Using a New
                   Test Collection of Images with Multiple Relevance
                   Assessments},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {1},
   volume       = {2},
   pages        = {89--114},
   year         = {2000},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The Ostensive Model proposes a manner of structuring the
                   uncertainty associated with individual relevance
                   judgements as sources of evidence in relevance feedback.
                   It proposes temporal profiles of uncertainty, motivating
                   the application of a particular class of discount function
                   with respect to the age of the evidence. This paper
                   presents an initial evaluation of the relative
                   effectiveness of different uncertainty discount
                   functions. A novel direct manipulation interface to a
                   multimedia retrieval system embodying the Ostensive Model
                   is outlined briefly. The paper describes the construction
                   and characteristics of a new image test collection
                   utilising multiple binary relevance assessments. The use
                   of such multiple assessments and multiple interpretations
                   of them are discussed. The evaluation environment is
                   detailed in terms of the interface, test collection, and
                   tasks set to users. Multiple interpretations of the
                   results, and the statistical significance of comparisons
                   are presented. The results obtained in the evaluation are
                   consistent with the proposals of the Ostensive
                   Model---reinforcing a particular evidence profile. The
                   results give clear pointers to further, more specific,
                   evaluations.},
}
@article{Reid00,
   author       = {Reid, Jane},
   title        = {A Task-Oriented Non-Interactive Evaluation Methodology
                   for Information Retrieval Systems},
   journal      = {Information Retrieval},
   volume       = {2},
   number       = {1},
   pages        = {115--129},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Past research has identified many different types of
                   relevance in information retrieval (IR). So far, however,
                   most evaluation of IR systems has been through batch
                   experiments conducted with test collections containing
                   only expert, topical relevance judgements. Recently, there
                   has been some movement away from this traditional approach
                   towards interactive, more user-centred methods of
                   evaluation. However, these are expensive for evaluators in
                   terms both of time and of resources. This paper describes
                   a new evaluation methodology, using a task-oriented test
                   collection, which combines the advantages of traditional
                   non-interactive testing with a more user-centred emphasis.
                   The main features of a task-oriented test collection are
                   the adoption of the task, rather than the query, as the
                   primary unit of evaluation and the naturalistic character
                   of the relevance judgements.},
}
@article{Baird00,
   author       = {Baird, Henry S. and Chen, Francine R.},
   title        = {Introduction},
   journal      = {Information Retrieval},
   volume       = {2},
   number       = {2/3},
   pages        = {139--140},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Mitra00,
   author       = {M. Mitra and B. B. Chaudhuri},
   title        = {Information Retrieval from Documents: A Survey},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {2/3},
   volume       = {2},
   pages        = {141--163},
   year         = {2000},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Given the phenomenal growth in the variety and quantity
                   of data available to users through electronic media, there
                   is a great demand for efficient and effective ways to
                   organize and search through all this information. Besides
                   speech, our principal means of communication is through
                   visual media, and in particular, through documents. In
                   this paper, we provide an update on Doermann's
                   comprehensive survey (1998) of research results in the
                   broad area of document-based information retrieval. The
                   scope of this survey is also somewhat broader, and there
                   is a greater emphasis on relating document image analysis
                   methods to conventional IR methods. Documents are available
                   in a wide variety of formats. Technical papers are often
                   available as ASCII files of clean, correct, text. Other
                   documents may only be available as hardcopies. These
                   documents have to be scanned and stored as images so that
                   they may be processed by a computer. The textual content
                   of these documents may also be extracted and recognized
                   using OCR methods. Our survey covers the broad spectrum of
                   methods that are required to handle different formats like
                   text and images. The core of the paper focuses on methods
                   that manipulate document images directly, and perform
                   various information processing tasks such as retrieval,
                   categorization, and summarization, without attempting to
                   completely recognize the textual content of the document.
                   We start, however, with a brief overview of traditional IR
                   techniques that operate on clean text. We also discuss
                   research dealing with text that is generated by running
                   OCR on document images. Finally, we also briefly touch on
                   the related problem of content-based image retrieval.},
}
@article{Kantor00,
   author       = {Kantor, Paul B. and Voorhees, Ellen M.},
   title        = {The {TREC-5} Confusion Track: Comparing Retrieval Methods
                   for Scanned Text},
   journal      = {Information Retrieval},
   volume       = {2},
   number       = {2/3},
   pages        = {165--176},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {A known-item search is a particular information retrieval
                   task in which the system is asked to find a single target
                   document in a large document set. The TREC-5 confusion
                   track used a set of 49 known-item tasks to study the
                   impact of data corruption on retrieval system performance.
                   Two corrupted versions of a 55,600 document corpus whose
                   true content was known were created by applying OCR
                   techniques to page images. The first version of the corpus
                   used the page images as scanned, resulting in an estimated
                   character error rate of approximately 5\%. The second
                   version used page images that had been down-sampled,
                   resulting in an estimated character error rate of
                   approximately 20\%. The true text and each of the
                   corrupted versions were then searched using the same set
                   of 49 questions. In general, retrieval methods that
                   attempted a probabilistic reconstruction of the original
                   clean text fared better than methods that simply accepted
                   corrupted versions of the query text.},
}
@article{Lopresti00,
   author       = {Lopresti, Daniel and Zhou, Jiangying},
   title        = {Locating and Recognizing Text in {WWW} Images},
   journal      = {Information Retrieval},
   volume       = {2},
   number       = {2/3},
   pages        = {177--206},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The explosive growth of the World Wide Web has resulted
                   in a distributed database consisting of hundreds of
                   millions of documents. While existing search engines index
                   a page based on the text that is readily extracted from
                   its HTML encoding, an increasing amount of the information
                   on the Web is embedded in images. This situation presents
                   a new and exciting challenge for the fields of document
                   analysis and information retrieval, as WWW image text is
                   typically rendered in color and at very low spatial
                   resolutions. In this paper, we survey the results of
                   several years of our work in the area. For the problem of
                   locating text in Web images, we describe a procedure based
                   on clustering in color space followed by a
                   connected-components analysis that seems promising. For
                   character recognition, we discuss techniques using
                   polynomial surface fitting and ``fuzzy'' n-tuple
                   classifiers. Also presented are the results of several
                   experiments that demonstrate where our methods perform
                   well and where more work needs to be done. We conclude
                   with a discussion of topics for further research.},
}
% The third author uses BibTeX's "von Last, Jr, First" form so that "III" is
% parsed as the name suffix rather than as the given name.
@article{Williams00,
   author       = {Williams, William J. and Zalubas, Eugene J. and
                   Hero, III, Alfred O.},
   title        = {Word Spotting in Bitmapped Fax Documents},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {2/3},
   volume       = {2},
   pages        = {207--226},
   year         = {2000},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Images and signals may be represented by forms invariant
                   to time shifts, spatial shifts, frequency shifts, and
                   scale changes. Advances in time-frequency analysis and
                   scale transform techniques have made this possible.
                   However, factors such as noise contamination and ``style''
                   differences complicate this. An example is found in text,
                   where letters and words may vary in size and position.
                   Examples of complicating variations include the font used,
                   corruption during facsimile (fax) transmission, and
                   printer characteristics. The solution advanced in this
                   paper is to cast the desired invariants into separate
                   subspaces for each extraneous factor or group of factors.
                   The first goal is to have minimal overlap between these
                   subspaces and the second goal is to be able to identify
                   each subspace accurately. Concepts borrowed from
                   high-resolution spectral analysis, but adapted uniquely to
                   this problem have been found to be useful in this context.
                   Once the pertinent subspace is identified, the recognition
                   of a particular invariant form within this subspace is
                   relatively simple using well-known singular value
                   decomposition (SVD) techniques. The basic elements of the
                   approach can be applied to a variety of pattern
                   recognition problems. The specific application covered in
                   this paper is word spotting in bitmapped fax documents.},
}
@article{Hu00,
   author       = {Hu, Jianying and Kashi, Ramanujan and Wilfong, Gordon},
   title        = {Comparison and Classification of Documents Based on
                   Layout Similarity},
   journal      = {Information Retrieval},
   volume       = {2},
   number       = {2/3},
   pages        = {227--243},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {This paper describes features and methods for document
                   image comparison and classification at the spatial layout
                   level. The methods are useful for visual similarity based
                   document retrieval as well as fast algorithms for initial
                   document type classification without OCR. A novel feature
                   set called interval encoding is introduced to capture
                   elements of spatial layout. This feature set encodes
                   region layout information in fixed-length vectors by
                   capturing structural characteristics of the image. These
                   fixed-length vectors are then compared to each other
                   through a Manhattan distance computation for fast page
                   layout comparison. The paper describes experiments and
                   results to rank-order a set of document pages in terms of
                   their layout similarity to a test document. We also
                   demonstrate the usefulness of the features derived from
                   interval coding in a hidden Markov model based page layout
                   classification system that is trainable and extendible.
                   The methods described in the paper can be used in various
                   document retrieval tasks including visual similarity based
                   retrieval, categorization and information extraction.},
}
@article{Srihari00,
   author       = {Srihari, Rohini K. and Zhang, Zhongfei and Rao, Aibing},
   title        = {Intelligent Indexing and Semantic Retrieval of Multimodal
                   Documents},
   journal      = {Information Retrieval},
   volume       = {2},
   number       = {2/3},
   pages        = {245--275},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Finding useful information from large multimodal document
                   collections such as the WWW without encountering numerous
                   false positives poses a challenge to multimedia
                   information retrieval systems (MMIR). This research
                   addresses the problem of finding pictures. The fact that
                   images do not appear in isolation, but rather with
                   accompanying, collateral text is exploited. Taken
                   independently, existing techniques for picture retrieval
                   using (i) text-based and (ii) image-based methods have
                   several limitations. This research presents a general
                   model for multimodal information retrieval that addresses
                   the following issues: (i) users' information need, (ii)
                   expressing information need through composite, multimodal
                   queries, and (iii) determining the most appropriate
                   weighted combination of indexing techniques in order to
                   best satisfy information need. A machine learning approach
                   is proposed for the latter. The focus is on improving
                   precision and recall in a MMIR system by optimally
                   combining text and image similarity. Experiments are
                   presented which demonstrate the utility of individual
                   indexing systems in improving overall average precision.},
}
@article{Hughey00,
   author       = {M. K. Hughey and M. W. Berry},
   title        = {Improved Query Matching Using kd-Trees: A Latent Semantic
                   Indexing Enhancement},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {4},
   volume       = {2},
   pages        = {287--302},
   year         = {2000},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Efficient information searching and retrieval methods are
                   needed to navigate the ever increasing volumes of digital
                   information. Traditional lexical information retrieval
                   methods can be inefficient and often return inaccurate
                   results. To overcome problems such as polysemy and
                   synonymy, concept-based retrieval methods have been
                   developed. One such method is Latent Semantic Indexing
                   (LSI), a vector-space model, which uses the singular value
                   decomposition (SVD) of a term-by-document matrix to
                   represent terms and documents in k-dimensional space. As
                   with other vector-space models, LSI is an attempt to
                   exploit the underlying semantic structure of word usage in
                   documents. During the query matching phase of LSI, a
                   user's query is first projected into the term-document
                   space, and then compared to all terms and documents
                   represented in the vector space. Using some similarity
                   measure, the nearest (most relevant) terms and documents
                   are identified and returned to the user. The current LSI
                   query matching method requires that the similarity measure
                   be computed between the query and every term and document
                   in the vector space. In this paper, the kd-tree searching
                   algorithm is used within a recent LSI implementation to
                   reduce the time and computational complexity of query
                   matching. The kd-tree data structure stores the term and
                   document vectors in such a way that only those terms and
                   documents that are most likely to qualify as nearest
                   neighbors to the query will be examined and retrieved.},
}
@article{Turney00,
   author       = {Turney, Peter D.},
   title        = {Learning Algorithms for Keyphrase Extraction},
   journal      = {Information Retrieval},
   volume       = {2},
   number       = {4},
   pages        = {303--336},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Many academic journals ask their authors to provide a
                   list of about five to fifteen keywords, to appear on the
                   first page of each article. Since these key words are
                   often phrases of two or more words, we prefer to call them
                   keyphrases. There is a wide variety of tasks for which
                   keyphrases are useful, as we discuss in this paper. We
                   approach the problem of automatically extracting
                   keyphrases from text as a supervised learning task. We
                   treat a document as a set of phrases, which the learning
                   algorithm must learn to classify as positive or negative
                   examples of keyphrases. Our first set of experiments
                   applies the C4.5 decision tree induction algorithm to this
                   learning task. We evaluate the performance of nine
                   different configurations of C4.5. The second set of
                   experiments applies the GenEx algorithm to the task. We
                   developed the GenEx algorithm specifically for
                   automatically extracting keyphrases from text. The
                   experimental results support the claim that a
                   custom-designed algorithm (GenEx), incorporating
                   specialized procedural domain knowledge, can generate
                   better keyphrases than a general-purpose algorithm (C4.5).
                   Subjective human evaluation of the keyphrases generated by
                   GenEx suggests that about 80\% of the keyphrases are
                   acceptable to human readers. This level of performance
                   should be satisfactory for a wide variety of
                   applications.},
}
% The Dutch particles are lowercase and the names use the "von Last, First"
% form so that BibTeX parses "van" / "van der" as the von part of the surname.
@article{Wondergem00,
   author       = {Wondergem, B. C. M. and van Bommel, P. and
                   van der Weide, Th. P.},
   title        = {Matching Index Expressions for Information Retrieval},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {4},
   volume       = {2},
   pages        = {337--360},
   year         = {2000},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The INN system is a dynamic hypertext tool for searching
                   and exploring the WWW. It uses a dynamically built
                   ancillary layer to support easy interaction. This layer
                   features the subexpressions of index expressions that are
                   extracted from rendered documents. Currently, the INN
                   system uses keyword based matching. The effectiveness of
                   the INN system may be increased by using matching
                   functions for index expressions. In the design of such
                   functions, several constraints stemming from the INN must
                   be taken into account. Important constraints are a limited
                   response time and storage space, a focus on discriminating
                   (different notions of) subexpressions for index
                   expressions, and domain independency. With these
                   contextual constraints in mind, several matching functions
                   are designed and both theoretically and practically
                   evaluated.},
}
@article{Jones00,
   author       = {Jones, Gareth and Sakai, Tetsuya and Kajiura, Masahiro and
                   Sumita, Kazuo},
   title        = {Incremental Relevance Feedback in {Japanese} Text
                   Retrieval},
   journal      = {Information Retrieval},
   volume       = {2},
   number       = {4},
   pages        = {361--384},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The application of relevance feedback techniques has been
                   shown to improve retrieval performance for a number of
                   information retrieval tasks. This paper explores
                   incremental relevance feedback for ad hoc Japanese text
                   retrieval; examining, separately and in combination, the
                   utility of term reweighting and query expansion using a
                   probabilistic retrieval model. Retrieval performance is
                   evaluated in terms of standard precision-recall measures,
                   and also using ``number-to-view'' graphs. Experimental
                   results, on the standard BMIR-J2 Japanese language
                   retrieval collection, show that both term reweighting and
                   query expansion improve retrieval performance. This is
                   reflected in improvements in both precision and recall,
                   but also a reduction in the average number of documents
                   which must be viewed to find a selected number of relevant
                   items. In particular, using a simple simulation of user
                   searching, incremental application of relevance
                   information is shown to lead to progressively improved
                   retrieval performance and an overall reduction in the
                   number of documents that a user must view to find relevant
                   ones.},
}
@article{Zobel00,
   author       = {Zobel, Justin},
   title        = {Guest Introduction},
   journal      = {Information Retrieval},
   volume       = {3},
   number       = {1},
   pages        = {5--6},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Klein00,
   author       = {Shmuel T. Klein},
   title        = {Skeleton Trees for the Efficient Decoding of {Huffman}
                   Encoded Texts},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {1},
   volume       = {3},
   pages        = {7--23},
   year         = {2000},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {A new data structure is investigated, which allows fast
                   decoding of texts encoded by canonical Huffman codes. The
                   storage requirements are much lower than for conventional
                   Huffman trees, $O(\log^2 n)$ for trees of depth
                   $O(\log n)$, and decoding is faster, because a part of the
                   bit-comparisons necessary for the decoding may be saved.
                   Empirical results on large real-life distributions show a
                   reduction of up to 50\% and more in the number of bit
                   operations. The basic idea is then generalized, yielding
                   further savings.},
}
@article{Moffat00,
   author       = {Alistair Moffat and Lang Stuiver},
   title        = {Binary Interpolative Coding for Effective Index
                   Compression},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {1},
   volume       = {3},
   pages        = {25--47},
   year         = {2000},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Information retrieval systems contain large volumes of
                   text, and currently have typical sizes into the gigabyte
                   range. Inverted indexes are one important method for
                   providing search facilities into these collections, but
                   unless compressed require a great deal of space. In this
                   paper we introduce a new method for compressing inverted
                   indexes that yields excellent compression, fast decoding,
                   and exploits clustering---the tendency for words to appear
                   relatively frequently in some parts of the collection and
                   infrequently in others. We also describe two other quite
                   separate applications for the same compression method:
                   representing the MTF list positions generated by the
                   Burrows-Wheeler Block Sorting transformation; and
                   transmitting the codebook for semi-static block-based
                   minimum-redundancy coding.},
}
% The second author uses the "von Last, First" form with a lowercase particle
% so that BibTeX parses "de" as the von part of the surname "de Moura".
@article{Navarro00,
   author       = {Navarro, Gonzalo and de Moura, Edleno Silva and
                   Neubert, Marden and Ziviani, Nivio and
                   Baeza-Yates, Ricardo},
   title        = {Adding Compression to Block Addressing Inverted Indexes},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {1},
   volume       = {3},
   pages        = {49--77},
   year         = {2000},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Inverted index compression, block addressing and
                   sequential search on compressed text are three techniques
                   that have been separately developed for efficient,
                   low-overhead text retrieval. Modern text compression
                   techniques can reduce the text to less than 30\% of its
                   size and allow searching it directly and faster than the
                   uncompressed text. Inverted index compression obtains
                   significant reduction of its original size at the same
                   processing speed. Block addressing makes the inverted
                   lists point to text blocks instead of exact positions and
                   pay the reduction in space with some sequential text
                   scanning. In this work we combine the three ideas in a
                   single scheme. We present a compressed inverted file that
                   indexes compressed text and uses block addressing. We
                   consider different techniques to compress the index and
                   study their performance with respect to the block size. We
                   compare the index against three separate techniques for
                   varying block sizes, showing that our index is superior to
                   each isolated approach. For instance, with just 4\% of
                   extra space overhead the index has to scan less than 12\%
                   of the text for exact searches and about 20\% allowing one
                   error in the matches.},
}
@article{Wermter00,
   author       = {Wermter, Stefan},
   title        = {Neural Network Agents for Learning Semantic Text
                   Classification},
   journal      = {Information Retrieval},
   volume       = {3},
   number       = {2},
   pages        = {87--103},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The research project AgNeT develops Agents for Neural
                   Text routing in the internet. Unrestricted potentially
                   faulty text messages arrive at a certain delivery point
                   (e.g. email address or world wide web address). These text
                   messages are scanned and then distributed to one of
                   several expert agents according to a certain task
                   criterium. Possible specific scenarios within this
                   framework include the learning of the routing of
                   publication titles or news titles. In this paper we
                   describe extensive experiments for semantic text routing
                   based on classified library titles and newswire titles.
                   This task is challenging since incoming messages may
                   contain constructions which have not been anticipated.
                   Therefore, the contributions of this research are in
                   learning and generalizing neural architectures for the
                   robust interpretation of potentially noisy unrestricted
                   messages. Neural networks were developed and examined for
                   this topic since they support robustness and learning in
                   noisy unrestricted real-world texts. We describe and
                   compare different sets of experiments. The first set of
                   experiments tests a recurrent neural network for the task
                   of library title classification. Then we describe a larger
                   more difficult newswire classification task from
                   information retrieval. The comparison of the examined
                   models demonstrates that techniques from information
                   retrieval integrated into recurrent plausibility networks
                   performed well even under noise and for different
                   corpora.},
}
@article{Moghrabi00,
   author       = {I. A. R. Moghrabi and R. A. Makholian},
   title        = {A New Approach to Clustering Records in Information
                   Retrieval Systems},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {2},
   volume       = {3},
   pages        = {105--126},
   year         = {2000},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {This work introduces a new approach to record clustering
                   where a hybrid algorithm is presented to cluster records
                   based upon threshold values and the query patterns made to
                   a particular database. The Hamming Distance of a file is
                   used as a measure of space density. The objective of the
                   algorithm is to minimize the Hamming Distance of the file
                   while attaching significance to the most frequent queries
                   being asked. Simulation experiments conducted proved that
                   a great reduction in response time is yielded after the
                   restructuring of a file. We study the space density
                   properties of a file and how it affects retrieval time
                   before and after clustering, as a means of predicting file
                   performance and making appropriate choices of parameters.
                   Criteria, such as, block size, threshold value, percentage
                   of records satisfying a given set of queries, etc., which
                   affect clustering and response time are also studied.},
}
@article{McCallum00,
   author       = {McCallum, Andrew K. and Nigam, Kamal and Rennie, Jason and
                   Seymore, Kristie},
   title        = {Automating the Construction of Internet Portals with
                   Machine Learning},
   journal      = {Information Retrieval},
   volume       = {3},
   number       = {2},
   pages        = {127--163},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Domain-specific internet portals are growing in
                   popularity because they gather content from the Web and
                   organize it for easy access, retrieval and search. For
                   example, www.campsearch.com allows complex queries by age,
                   location, cost and specialty over summer camps. This
                   functionality is not possible with general, Web-wide
                   search engines. Unfortunately these portals are difficult
                   and time-consuming to maintain. This paper advocates the
                   use of machine learning techniques to greatly automate the
                   creation and maintenance of domain-specific Internet
                   portals. We describe new research in reinforcement
                   learning, information extraction and text classification
                   that enables efficient spidering, the identification of
                   informative text segments, and the population of topic
                   hierarchies. Using these techniques, we have built a
                   demonstration system: a portal for computer science
                   research papers. It already contains over 50,000 papers
                   and is publicly available at www.cora.justresearch.com.
                   These techniques are widely applicable to portal creation
                   in other domains.},
}
@article{Schauble00,
   author       = {Sch{\"{a}}uble, Peter and Mittendorf, Elke},
   title        = {Introduction},
   journal      = {Information Retrieval},
   volume       = {3},
   number       = {3},
   pages        = {171--172},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Wechsler00,
   author       = {Wechsler, Martin and Munteanu, Eugen and
                   Sch{\"{a}}uble, Peter},
   title        = {New Approaches to Spoken Document Retrieval},
   journal      = {Information Retrieval},
   volume       = {3},
   number       = {3},
   pages        = {173--188},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {This paper presents four novel techniques for
                   open-vocabulary spoken document retrieval: a method to
                   detect slots that possibly contain a query feature; a
                   method to estimate occurrence probabilities; a technique
                   that we call collection-wide probability re-estimation and
                   a weighting scheme which takes advantage of the fact that
                   long query features are detected more reliably. These four
                   techniques have been evaluated using the TREC-6 spoken
                   document retrieval test collection to determine the
                   improvements in retrieval effectiveness with respect to a
                   baseline retrieval method. Results show that the retrieval
                   effectiveness can be improved considerably despite the
                   large number of speech recognition errors.},
}
@article{Mittendorf00,
   author       = {Mittendorf, Elke and Sch{\"{a}}uble, Peter},
   title        = {Information Retrieval can Cope with Many Errors},
   journal      = {Information Retrieval},
   volume       = {3},
   number       = {3},
   pages        = {189--216},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The retrieval of documents that originate from digitized
                   and OCR-converted paper documents is an important task for
                   modern retrieval systems. The problems that OCR errors
                   cause for the retrieval process have been subject to
                   research for several years now. We approach the problem
                   from a theoretical point of view and model OCR conversion
                   as a random experiment. Our theoretical results, which are
                   supported by experiments, show clearly that information
                   retrieval can cope even with many errors. It is, however,
                   important that the documents are not too short and that
                   recognition errors are distributed appropriately among
                   words and documents. These results disclose that an
                   expensive manual or automatic post-processing of
                   OCR-converted documents usually does not make sense, but
                   that scanning and OCR must be performed in an appropriate
                   way and with care.},
}
@article{Wechsler00a,
   author       = {Wechsler, Martin and Sch{\"{a}}uble, Peter},
   title        = {The Probability Ranking Principle Revisited},
   journal      = {Information Retrieval},
   volume       = {3},
   number       = {3},
   pages        = {217--227},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {A theoretic framework for multimedia information
                   retrieval is introduced which guarantees optimal retrieval
                   effectiveness. In particular, a Ranking Principle for
                   Distributed Multimedia-Documents (RPDM) is described
                   together with an algorithm that satisfies this principle.
                   Finally, the RPDM is shown to be a generalization of the
                   Probability Ranking principle (PRP) which guarantees
                   optimal retrieval effectiveness in the case of text
                   document retrieval. The PRP justifies theoretically the
                   relevance ranking adopted by modern search engines. In
                   contrast to the classical PRP, the new RPDM takes into
                   account transmission and inspection time, and most
                   importantly, aspectual recall rather than simple recall.},
}
@article{Csillaghy00,
   author       = {A. Csillaghy and H. Hinterberger and A. O. Benz},
   title        = {Content-Based Image Retrieval in Astronomy},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {3},
   volume       = {3},
   pages        = {229--241},
   year         = {2000},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Content-based image retrieval in astronomy needs methods
                   that can deal with an image content made of noisy and
                   diffuse structures. This motivates investigations on how
                   information should be summarized and indexed for this
                   specific kind of images. The method we present first
                   summarizes the image information content by partitioning
                   the image in regions with same texture. We call this
                   process texture summarization. Second, indexing features
                   are generated by examining the distribution of parameters
                   describing image regions. Indexing features can be
                   associated with global or local image characteristics.
                   Both kinds of indexing features are evaluated on the
                   retrieval system of the Zurich archive of solar radio
                   spectrograms. The evaluation shows that generating local
                   indexing features using self-organizing maps yields the
                   best effectiveness of all tested methods.},
}
@article{Mittendorf00a,
   author       = {Mittendorf, Elke and Mateev, Bojidar and
                   Sch{\"{a}}uble, Peter},
   title        = {Using the Co-occurrence of Words for Retrieval
                   Weighting},
   journal      = {Information Retrieval},
   volume       = {3},
   number       = {3},
   pages        = {243--251},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {We have applied the well-known Robertson-Sparck Jones
                   weighting to sets of indexing features that are different
                   from word-based features. Our features describe the
                   co-occurrences of words in a window range of predefined
                   size. The experiments have been designed to analyse the
                   value of features that are beyond word-based features but
                   all used retrieval methods can be motivated strictly in
                   the probabilistic framework. Among the several
                   implications of our experiments for weighted retrieval is
                   the surprising result that features that describe the
                   co-occurrences of words in sentence-size or paragraph-size
                   windows are significantly better descriptors than purely
                   word-based indexing features.},
}
@article{Baumgarten00,
   author       = {Christoph Baumgarten},
   title        = {Retrieving Information from a Distributed Heterogeneous
                   Document Collection},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {3},
   volume       = {3},
   pages        = {253--271},
   year         = {2000},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {This paper describes a probabilistic model for optimum
                   information retrieval in a distributed heterogeneous
                   environment. The model assumes the collection of documents
                   offered by the environment to be partitioned into
                   subcollections. Documents as well as subcollections have
                   to be indexed, where indexing methods using different
                   indexing vocabularies can be employed. A query provided by
                   a user is answered in terms of a ranked list of documents.
                   The model determines a procedure for ranking the documents
                   that stems from the Probability Ranking Principle: For
                   each subcollection, the subcollection's documents are
                   ranked; the resulting ranked lists are combined into a
                   final ranked list of documents, where the ordering is
                   determined by the documents' probabilities of being
                   relevant with respect to the user's query. Various
                   probabilistic ranking methods may be involved in the
                   distributed ranking process. A criterion for effectively
                   limiting the ranking process to a subset of subcollections
                   extends the model. The property that different ranking
                   methods and indexing vocabularies can be used is important
                   when the subcollections are heterogeneous with respect to
                   their content. The model's applicability is experimentally
                   confirmed. When exploiting the degrees of freedom provided
                   by the model, experiments showed evidence that the model
                   even outperforms comparable models for the non-distributed
                   case with respect to retrieval effectiveness.},
}
@article{Braschler00,
   author       = {Braschler, Martin and Sch{\"{a}}uble, Peter},
   title        = {Using Corpus-Based Approaches in a System for
                   Multilingual Information Retrieval},
   journal      = {Information Retrieval},
   volume       = {3},
   number       = {3},
   pages        = {273--284},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {We present a system for multilingual information
                   retrieval that allows users to formulate queries in their
                   preferred language and retrieve relevant information from
                   a collection containing documents in multiple languages.
                   The system is based on a process of document level
                   alignments, where documents of different languages are
                   paired according to their similarity. The resulting
                   mapping allows us to produce a multilingual comparable
                   corpus. Such a corpus has multiple interesting
                   applications. It allows us to build a data structure for
                   query translation in cross-language information retrieval
                   (CLIR). Moreover, we also perform pseudo relevance
                   feedback on the alignments to improve our retrieval
                   results. And finally, multiple retrieval runs can be
                   merged into one unified result list. The resulting system
                   is inexpensive, adaptable to domain-specific collections
                   and new languages and has performed very well at the
                   TREC-7 conference CLIR system comparison.},
}
@article{Liddy00,
   author       = {Liddy, Elizabeth D. and Diamond, Ted and McKenna, Mary},
   title        = {{DR-LINK} in {TIPSTER III}},
   journal      = {Information Retrieval},
   volume       = {3},
   number       = {4},
   pages        = {291--311},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {A Natural Language Processing based Information Retrieval
                   System that was one of the original systems developed in
                   Phase I of TIPSTER, was the basis of research in TIPSTER
                   III the goal of which was to add two extended capabilities
                   to the core system. Following a description of the
                   multiple levels of linguistic processing that were
                   developed for the original DR-LINK System, details are
                   provided on research into query-specific data fusion and
                   query-specific cross-document summarization. Experimental
                   results show that there is potential for improving
                   retrieval through query-specific fusion and that analysts
                   found the Detailed Multiple Document Summary to be
                   extremely useful for almost every query, while the
                   Thumbnail sketch was useful in approximately 50\% of the
                   queries.},
}
@article{Kwok00,
   author       = {K. L. Kwok},
   title        = {Improving {English} and {Chinese} Ad-Hoc Retrieval: A
                   {Tipster} Text Phase 3 Project Report},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {4},
   volume       = {3},
   pages        = {313--338},
   year         = {2000},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Both English and Chinese ad-hoc information retrieval
                   were investigated in this Tipster 3 project. Part of our
                   objectives is to study the use of various term level and
                   phrasal level evidence to improve retrieval accuracy. For
                   short queries, we studied five term level techniques that
                   together can lead to good improvements over standard
                   ad-hoc 2-stage retrieval for TREC5-8 experiments. For long
                   queries, we studied the use of linguistic phrases to
                   re-rank retrieval lists. Its effect is small but
                   consistently positive. For Chinese IR, we investigated
                   three simple representations for documents and queries:
                   short-words, bigrams and characters. Both approximate
                   short-word segmentation or bigrams, augmented with
                   characters, give highly effective results. Accurate word
                   segmentation appears not crucial for overall result of a
                   query set. Character indexing by itself is not
                   competitive. Additional improvements may be obtained using
                   collection enrichment and combination of retrieval
                   lists. Our PIRCS document-focused retrieval is also shown
                   to have similarity with a simple language model approach
                   to IR.},
}
@article{Davis00,
   author       = {Mark W. Davis and William C. Ogden},
   title        = {Towards Universal Text Retrieval: {Tipster} Text Retrieval
                   Research at {New Mexico State University}},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {4},
   volume       = {3},
   pages        = {339--356},
   year         = {2000},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {New Mexico State University's Computing Research Lab has
                   participated in research in all three phases of the US
                   Government's Tipster program. Our work on information
                   retrieval has focused on research and development of
                   multilingual and cross-language approaches to automatic
                   retrieval. The work on automatic systems has been
                   supplemented by additional research into the role of the
                   IR system user in interactive retrieval scenarios:
                   monolingual, multilingual and cross-language. The combined
                   efforts suggest that ``universal'' text retrieval, in
                   which a user can find, access and use documents in the
                   face of language differences and information overload, may
                   be possible.},
}
@article{Yager00,
   author       = {Yager, Ronald R.},
   title        = {A Hierarchical Document Retrieval Language},
   journal      = {Information Retrieval},
   volume       = {3},
   number       = {4},
   pages        = {357--377},
   year         = {2000},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The focus of this work is on the development of a
                   document retrieval language which attempts to enable users
                   to better represent their requirements with respect to
                   retrieved documents. We describe a framework for
                   evaluating documents which allows, in the spirit of
                   computing with words, a linguistic specification of the
                   interrelationship between the desired attributes. This
                   framework, which makes considerable use of the Ordered
                   Weighted Averaging (OWA) operator, also supports a
                   hierarchical structure which allows for an increased
                   expressiveness of queries.},
}
@article{Zhang01,
   author       = {Zhang, Tong and Oles, Frank J.},
   title        = {Text Categorization Based on Regularized Linear
                   Classification Methods},
   journal      = {Information Retrieval},
   volume       = {4},
   number       = {1},
   pages        = {5--31},
   year         = {2001},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {A number of linear classification methods such as the
                   linear least squares fit (LLSF), logistic regression, and
                   support vector machines (SVM's) have been applied to text
                   categorization problems. These methods share the
                   similarity by finding hyperplanes that approximately
                   separate a class of document vectors from its complement.
                   However, support vector machines are so far considered
                   special in that they have been demonstrated to achieve the
                   state of the art performance. It is therefore worthwhile
                   to understand whether such good performance is unique to
                   the SVM design, or if it can also be achieved by other
                   linear classification methods. In this paper, we compare a
                   number of known linear classification methods as well as
                   some variants in the framework of regularized linear
                   systems. We will discuss the statistical and numerical
                   properties of these algorithms, with a focus on text
                   categorization. We will also provide some numerical
                   experiments to illustrate these algorithms on a number of
                   datasets.},
}
@article{Hawking01,
   author       = {David Hawking and Nick Craswell and Peter Bailey and
                   Kathleen Griffiths},
   title        = {Measuring Search Engine Quality},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {1},
   volume       = {4},
   pages        = {33--59},
   year         = {2001},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The effectiveness of twenty public search engines is
                   evaluated using TREC-inspired methods and a set of 54
                   queries taken from real Web search logs. The World Wide
                   Web is taken as the test collection and a combination of
                   crawler and text retrieval system is evaluated. The
                   engines are compared on a range of measures derivable from
                   binary relevance judgments of the first seven live results
                   returned. Statistical testing reveals a significant
                   difference between engines and high intercorrelations
                   between measures. Surprisingly, given the dynamic nature
                   of the Web and the time elapsed, there is also a high
                   correlation between results of this study and a previous
                   study by Gordon and Pathak. For nearly all engines, there
                   is a gradual decline in precision at increasing cutoff
                   after some initial fluctuation. Performance of the engines
                   as a group is found to be inferior to the group of
                   participants in the TREC-8 Large Web task, although the
                   best engines approach the median of those systems.
                   Shortcomings of current Web search evaluation methodology
                   are identified and recommendations are made for future
                   improvements. In particular, the present study and its
                   predecessors deal with queries which are assumed to derive
                   from a need to find a selection of documents relevant to a
                   topic. By contrast, real Web search reflects a range of
                   other information need types which require different
                   judging and different measures.},
}
@article{Zhang01a,
   author       = {Jin Zhang},
   title        = {The Characteristic Analysis of the {DARE} Visual Space},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {1},
   volume       = {4},
   pages        = {61--78},
   year         = {2001},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {In this paper a distance-angle-based visual retrieval
                   tool DARE is introduced. The distance-based similarity
                   distribution, angle-based similarity distribution, and the
                   differences of their distributions in the visual space are
                   analyzed. The document cluster analysis in the visual
                   space and the document cluster comparison between the
                   document space and the visual space are addressed. A new
                   concept---Distance to Reference Axis---is introduced to
                   better understand the visual space. The impact of other
                   operations in DARE on the document distribution is
                   discussed. Future research directions including
                   significance of the index term distribution in the visual
                   space and a user study are addressed.},
}
@article{Benkhalifa01,
   author       = {Benkhalifa, Mohammed and Mouradi, Abdelhak and
                   Bouyakhf, Houssaine},
   title        = {Integrating External Knowledge to Supplement Training
                   Data in Semi-Supervised Learning for Text Categorization},
   journal      = {Information Retrieval},
   volume       = {4},
   number       = {2},
   pages        = {91--113},
   year         = {2001},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Text Categorization (TC) is the automated assignment of
                   text documents to predefined categories based on document
                   contents. TC has been an application for many learning
                   approaches, which prove effective. Nevertheless, TC
                   provides many challenges to machine learning. In this
                   paper, we suggest, for text categorization, the
                   integration of external WordNet lexical information to
                   supplement training data for a semi-supervised clustering
                   algorithm which can learn from both training and test
                   documents to classify new unseen documents. This algorithm
                   is the ``Semi-Supervised Fuzzy c-Means'' (ssFCM). Our
                   experiments use Reuters 21578 database and consist of
                   binary classifications for categories selected from the
                   115 TOPICS classes of the Reuters collection. Using the
                   Vector Space Model, each document is represented by its
                   original feature vector augmented with external feature
                   vector generated using WordNet. We verify experimentally
                   that the integration of WordNet helps ssFCM improve its
                   performance, effectively addresses the classification of
                   documents into categories with few training documents and
                   does not interfere with the use of training data.},
}
@article{Kim01,
   author       = {Kim, Jee-Hyub and Kwak, Byung-Kwan and Lee, Seungwoo and
                   Lee, Geunbae and Lee, Jong-Hyeok},
   title        = {A Corpus-Based Learning Method of Compound Noun Indexing
                   Rules for {Korean}},
   journal      = {Information Retrieval},
   volume       = {4},
   number       = {2},
   pages        = {115--132},
   year         = {2001},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {In Korean information retrieval, compound nouns play an
                   important role in improving precision in search
                   experiments. There are two major approaches to compound
                   noun indexing in Korean: statistical and linguistic. Each
                   method, however, has its own shortcomings, such as
                   limitations when indexing diverse types of compound nouns,
                   over-generation of compound nouns, and data sparseness in
                   training. In this paper, we propose a corpus-based
                   learning method, which can index diverse types of compound
                   nouns using rules automatically extracted from a large
                   corpus. The automatic learning method is more portable and
                   requires less human effort, although it exhibits a
                   performance level similar to the manual-linguistic
                   approach. We also present a new filtering method to solve
                   the problems of compound noun over-generation and data
                   sparseness.},
}
@article{Goldberg01,
   author       = {Ken Goldberg and Theresa Roeder and Dhruv Gupta and Chris
                   Perkins},
   title        = {Eigentaste: A Constant Time Collaborative Filtering
                   Algorithm},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {2},
   volume       = {4},
   pages        = {133--151},
   year         = {2001},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Eigentaste is a collaborative filtering algorithm that
                   uses universal queries to elicit real-valued user ratings
                   on a common set of items and applies principal component
                   analysis (PCA) to the resulting dense subset of the
                   ratings matrix. PCA facilitates dimensionality reduction
                   for offline clustering of users and rapid computation of
                   recommendations. For a database of n users, standard
                   nearest-neighbor techniques require O(n) processing time
                   to compute recommendations, whereas Eigentaste requires
                   O(1) (constant) time. We compare Eigentaste to alternative
                   algorithms using data from Jester, an online joke
                   recommending system. Jester has collected approximately
                   2,500,000 ratings from 57,000 users. We use the Normalized
                   Mean Absolute Error (NMAE) measure to compare performance
                   of different algorithms. In the Appendix we use Uniform
                   and Normal distribution models to derive analytic
                   estimates of NMAE when predictions are random. On the
                   Jester dataset, Eigentaste computes recommendations two
                   orders of magnitude faster with no loss of accuracy.
                   Jester is online at: http://eigentaste.berkeley.edu},
}
@article{Lopresti01,
   author       = {Lopresti, Daniel P.},
   title        = {A Comparison of Text-Based Methods for Detecting
                   Duplication in Scanned Document Databases},
   journal      = {Information Retrieval},
   volume       = {4},
   number       = {2},
   pages        = {153--173},
   year         = {2001},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {This paper presents an experimental evaluation of several
                   text-based methods for detecting duplication in scanned
                   document databases using uncorrected OCR output. This task
                   is made challenging both by the wide range of degradations
                   printed documents can suffer, and by conflicting
                   interpretations of what it means to be a ``duplicate.'' We
                   report results for four sets of experiments exploring
                   various aspects of the problem space. While the techniques
                   studied are generally robust in the face of most types of
                   OCR errors, there are nonetheless important differences
                   which we identify and discuss in detail.},
}
@article{Kekalainen01,
   author       = {Kek{\"{a}}l{\"{a}}inen, Jaana},
   title        = {Information Retrieval Special Issue: Conceptual,
                   Linguistic and Task-Based {IR}: Research at the
                   {University of Tampere}},
   journal      = {Information Retrieval},
   volume       = {4},
   number       = {3/4},
   pages        = {191--194},
   year         = {2001},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Alkula01,
   author       = {Alkula, Riitta},
   title        = {From Plain Character Strings to Meaningful Words:
                   Producing Better Full Text Databases for Inflectional and
                   Compounding Languages with Morphological Analysis
                   Software},
   journal      = {Information Retrieval},
   volume       = {4},
   number       = {3/4},
   pages        = {195--208},
   year         = {2001},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The paper deals with linguistic processing and retrieval
                   techniques in fulltext databases. Special attention is
                   focused on the characteristics of highly inflectional
                   languages, and how morphological structure of a language
                   should be taken into account, when designing and
                   developing information retrieval systems. Finnish is used
                   as an example of a language, which has a more complicated
                   inflectional structure than the English language. In the
                   FULLTEXT project, natural language analysis modules for
                   Finnish were incorporated into the commercial BASIS
                   information retrieval system, which is based on inverted
                   files and Boolean searching. Several test databases were
                   produced, each using one or two Finnish morphological
                   analysis programs.},
}
@article{Pirkola01,
   author       = {Pirkola, Ari and Hedlund, Turid and Keskustalo, Heikki and
                   J{\"{a}}rvelin, Kalervo},
   title        = {Dictionary-Based Cross-Language Information Retrieval:
                   Problems, Methods, and Research Findings},
   journal      = {Information Retrieval},
   volume       = {4},
   number       = {3/4},
   pages        = {209--230},
   year         = {2001},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {This paper reviews literature on dictionary-based
                   cross-language information retrieval (CLIR) and presents
                   CLIR research done at the University of Tampere (UTA). The
                   main problems associated with dictionary-based CLIR, as
                   well as appropriate methods to deal with the problems are
                   discussed. We will present the structured query model by
                   Pirkola and report findings for four different language
                   pairs concerning the effectiveness of query structuring.
                   The architecture of our automatic query translation and
                   construction system is presented.},
}
% NOTE(review): the key below reads "Jarvelinen" although the first author's
% surname in the author field is "Jarvelin" (with an umlaut on the first a).
% The key is left untouched so that existing citations keep resolving.
@article{Jarvelinen01,
   author       = {J{\"{a}}rvelin, Kalervo and Kek{\"{a}}l{\"{a}}inen, Jaana
                   and Niemi, Timo},
   title        = {{ExpansionTool}: Concept-Based Query Expansion and
                   Construction},
   journal      = {Information Retrieval},
   volume       = {4},
   number       = {3/4},
   pages        = {231--255},
   year         = {2001},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {We develop a deductive data model for concept-based query
                   expansion. It is based on three abstraction levels: the
                   conceptual, linguistic and string levels. Concepts and
                   relationships among them are represented at the conceptual
                   level. The linguistic level gives natural language
                   expressions for concepts. Each expression has one or more
                   matching patterns at the string level. The models specify
                   the matching of the expression in database indices built
                   in varying ways. The data model supports a declarative
                   concept-based query expansion and formulation tool, the
                   ExpansionTool, for heterogeneous IR system environments.
                   Conceptual expansion is implemented by a novel intelligent
                   operator for traversing transitive relationships among
                   cyclic concept networks. The number of expansion links
                   followed, their types, and weights can be used to control
                   expansion. A sample empirical experiment illustrating the
                   use of the ExpansionTool in IR experiments is presented.},
}
@article{Sormunen01,
   author       = {Eero Sormunen},
   title        = {Extensions to the {STAIRS} Study -- Empirical Evidence
                   for the Hypothesised Ineffectiveness of {Boolean} Queries
                   in Large Full-Text Databases},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {3/4},
   volume       = {4},
   pages        = {257--273},
   year         = {2001},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The STAIRS study conducted by Blair and Maron in the
                   mid-80's is a milestone in the history of IR evaluation.
                   Blair and Maron made strong conclusion about the
                   inadequacy of free-text searching large databases, and
                   their study has been widely referred in the literature to
                   justify the problems of effectiveness in IR systems.
                   However, some critics of the study have plausibly pointed
                   out that the ineffectiveness conclusions were not solidly
                   based on empirical data. This paper introduces a new
                   theoretical and empirical approach to study the problems
                   of high recall searching in large databases and reports
                   the results of a case experiment. The findings verify some
                   of the hypothetical conclusions introduced in the STAIRS
                   study, and expands the picture of falling performance. It
                   is shown that low precision in high recall searching is
                   unavoidable in exact-match Boolean searching since even
                   major concepts are often expressed implicitly in relevant
                   documents. The author suggests that the problem could be
                   reduced in facet-based best-match searching.},
}
@article{Markkula01,
   author       = {Markkula, Marjo and Tico, Marius and Sepponen, Bemmu and
                   Nirkkonen, Katja and Sormunen, Eero},
   title        = {A Test Collection for the Evaluation of Content-Based
                   Image Retrieval Algorithms -- A User and Task-Based
                   Approach},
   journal      = {Information Retrieval},
   volume       = {4},
   number       = {3/4},
   pages        = {275--293},
   year         = {2001},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Content-based image retrieval (CBIR) algorithms have been
                   seen as a promising access method for digital photograph
                   collections. Unfortunately, we have very little evidence
                   of the usefulness of these algorithms in real user needs
                   and contexts. In this paper, we introduce a test
                   collection for the evaluation of CBIR algorithms. In the
                   test collection, the performance testing is based on
                   photograph similarity perceived by end-users in the
                   context of realistic illustration tasks and environment.
                   The building process and the characteristics of the
                   resulting test collection are outlined, including a
                   typology of similarity criteria expressed by the subjects
                   judging the similarity of photographs. A small-scale study
                   on the consistency of similarity assessments is presented.
                   A case evaluation of two CBIR algorithms is reported. The
                   results show clear correlation between the subjects'
                   similarity assessments and the functioning of feature
                   parameters of the tested algorithms.},
}
@article{Vakkari01,
   author       = {Vakkari, Pertti},
   title        = {Changes in Search Tactics and Relevance Judgements when
                   Preparing a Research Proposal -- A Summary of the Findings
                   of a Longitudinal Study},
   journal      = {Information Retrieval},
   volume       = {4},
   number       = {3/4},
   pages        = {295--310},
   year         = {2001},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The article summarizes the empirical results on relations
                   between students' problem stages in the course of writing
                   their research proposals for their master's theses and the
                   information sought, choice of search terms and tactics and
                   relevance assessments of the information found for that
                   task. The study is based on Kuhlthau's model of the
                   information search process. The results of the study show
                   that there is a close connection between the students'
                   problem stages (mental model) in the task performance and
                   the information sought, search tactics used, and the
                   assessment of the relevance and utility of the information
                   found. The corroborated hypotheses extend and specify
                   ideas in Kuhlthau's model in the domain of IR. A theory of
                   task-based information searching based on the empirical
                   findings of the study is presented.},
}
@article{Kolcz02,
   author       = {Aleksander Ko{\l}cz and Joshua Alspector},
   title        = {Asymmetric Missing-data Problems: Overcoming the Lack of
                   Negative Data in Preference Ranking},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {1},
   volume       = {5},
   pages        = {5--40},
   year         = {2002},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {In certain classification problems there is a strong
                   asymmetry between the number of labeled examples available
                   for each of the classes involved. In an extreme case,
                   there may be a complete lack of labeled data for one of
                   the classes while, at the same time, there are adequate
                   labeled examples for the others, accompanied by a large
                   body of unlabeled data. Since most classification
                   algorithms require some information about all classes
                   involved, label estimation for the un-represented class is
                   desired. An important representative of this group of
                   problems is that of user interest/preference modeling
                   where there may be a large number of examples of what the
                   user likes with essentially no counterexamples. Recently,
                   there has been much interest in applying the EM algorithm
                   to incomplete data problems in the area of text retrieval
                   and categorization. We adapt this approach to the
                   asymmetric case of modeling user interests in news
                   articles, where only labeled positive training data are
                   available, with access to a large corpus of unlabeled
                   documents. User modeling is here equivalent to that of
                   user-specific document ranking. EM is used in conjunction
                   with the Naive Bayes model while its output is also
                   utilized by a Support Vector Machine and Rocchio's
                   technique. Our findings demonstrate that the EM algorithm
                   can be quite effective in modeling the negative class
                   under a number of different initialization schemes.
                   Although primarily just the negative training examples are
                   needed, a natural question is whether using all of the
                   estimated labels (i.e., positive and negative) would be
                   more (or less) beneficial. This is important considering
                   that, in this context, the initialization of the negative
                   class for EM is likely not to be very accurate.
                   Experimental results suggest that EM output should be
                   limited to negative label estimates only.},
}
@article{Kuriyama02,
   author       = {Kazuko Kuriyama and Noriko Kando and Toshihiko Nozue and
                   Koji Eguchi},
   title        = {Pooling for a Large-Scale Test Collection: An Analysis of
                   the Search Results from the First {NTCIR} Workshop},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {1},
   volume       = {5},
   pages        = {41--59},
   year         = {2002},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {We have conducted a study to: (1) verify the
                   exhaustiveness of pooling for the purpose of constructing
                   a large-scale test collection, and (2) examine whether a
                   difference in the number of pool documents can affect the
                   relative evaluation of IR systems. We carried out the
                   experiments using search topics, their relevance
                   assessments, and the search results that were submitted
                   for both the pre-test and test of the first NTCIR
                   Workshop. Our results verified the efficiency and the
                   effectiveness of the pooling method, the exhaustiveness of
                   the relevance assessments, and the reliability of the
                   evaluation using the test collection based on the pooling
                   method.},
}
@article{Chen02,
   author       = {Zhixiang Chen and Binhai Zhu},
   title        = {Some Formal Analysis of {Rocchio's} Similarity-Based
                   Relevance Feedback Algorithm},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {1},
   volume       = {5},
   pages        = {61--86},
   year         = {2002},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Rocchio's similarity-based Relevance feedback algorithm,
                   one of the most important query reformation methods in
                   information retrieval, is essentially an adaptive
                   supervised learning algorithm from examples. In spite of
                   its popularity in various applications there is little
                   rigorous analysis of its learning complexity in
                   literature. In this paper we show that in the binary
                   vector space model, if the initial query vector is 0, then
                   for any of the four typical similarities (inner product,
                   dice coefficient, cosine coefficient, and Jaccard
                   coefficient), Rocchio's similarity-based relevance
                   feedback algorithm makes at least n mistakes when used to
                   search for a collection of documents represented by a
                   monotone disjunction of at most k relevant features (or
                   terms) over the n-dimensional binary vector space
                   $\{0, 1\}^n$. When an arbitrary initial query vector in
                   $\{0, 1\}^n$ is used, it makes at least (n + k - 3)/2
                   mistakes to search for the same collection of documents.
                   The linear lower bounds are independent of the choices of
                   the threshold and coefficients that the algorithm may use
                   in updating its query vector and making its
                   classification.},
}
@article{Ruiz02,
   author       = {Miguel E. Ruiz and Padmini Srinivasan},
   title        = {Hierarchical Text Categorization Using Neural Networks},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {1},
   volume       = {5},
   pages        = {87--118},
   year         = {2002},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {This paper presents the design and evaluation of a text
                   categorization method based on the Hierarchical Mixture of
                   Experts model. This model uses a divide and conquer
                   principle to define smaller categorization problems based
                   on a predefined hierarchical structure. The final
                   classifier is a hierarchical array of neural networks. The
                   method is evaluated using the UMLS Metathesaurus as the
                   underlying hierarchical structure, and the OHSUMED test
                   set of MEDLINE records. Comparisons with an optimized
                   version of the traditional Rocchio's algorithm adapted for
                   text categorization, as well as flat neural network
                   classifiers are provided. The results show that the use of
                   the hierarchical structure improves text categorization
                   performance with respect to an equivalent flat model. The
                   optimized Rocchio algorithm achieves a performance
                   comparable with that of the hierarchical neural
                   networks.},
}
@article{Robertson02,
   author       = {Stephen Robertson},
   title        = {Introduction to the Special Issue: Overview of the {TREC}
                   Routing and Filtering Tasks},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {2/3},
   volume       = {5},
   pages        = {127--137},
   year         = {2002},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {This paper introduces the special issue, and reviews the
                   routing and filtering tasks as defined and evaluated at
                   TREC. The tasks attempt to simulate a specific service
                   situation: the system is assumed to process an incoming
                   stream of documents against profiles of user interest,
                   strictly in the time order in which they arrive, and
                   immediately refer any matching document to the user. In
                   the adaptive filtering version of the task, the user is
                   assumed to provide a relevance judgement instantly. The
                   rationale for the task definitions and the evaluation
                   measures used is discussed.},
}
@article{Allan02,
   author       = {James Allan},
   title        = {Detection As Multi-Topic Tracking},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {2/3},
   volume       = {5},
   pages        = {139--157},
   year         = {2002},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The topic tracking task from TDT is a variant of
                   information filtering tasks that focuses on event-based
                   topics in streams of broadcast news. In this study, we
                   compare tracking to another TDT task, detection, which has
                   the goal of partitioning all arriving news into topics,
                   regardless of whether the topics are of interest to
                   anyone, and even when a new topic appears that had not
                   been previous anticipated. There are clear relationships
                   between the two tasks (under some assumptions, a
                   ``perfect'' tracking system could ``solve'' the detection
                   problem), but they are evaluated quite differently. We
                   describe the two tasks and discuss their similarities. We
                   show how viewing detection as a form of multi-topic
                   parallel tracking can illuminate the performance tradeoffs
                   of detection over tracking.},
}
@article{Ault02,
   author       = {Thomas G. Ault and Yiming Yang},
   title        = {Information Filtering in {TREC-9} and {TDT-3}: A
                   Comparative Analysis},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {2/3},
   volume       = {5},
   pages        = {159--187},
   year         = {2002},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Much work on automated information filtering has been
                   done in the TREC and TDT domains, but differences in
                   corpora, the nature of TREC topics vs. TDT events, the
                   constraints imposed on training and testing, and the
                   choices of performance measures confound any meaningful
                   comparison between these domains. We attempt to bridge the
                   gap between them by evaluating the performance of the
                   k-nearest-neighbor (kNN) classification system on the
                   corpus and categories from one domain using the
                   constraints of the other. To maximize comparability and
                   understand the effect of the evaluation metrics specific
                   to each domain, we optimize the performance of kNN
                   separately for the F1, T9P (preferred metric for TREC-9)
                   and Ctrk (official metric for TDT-3) metrics. Through a
                   thorough comparison of our within-domain and cross-domain
                   results, our results demonstrate that the corpus used for
                   TREC-9 is more challenging for an information filtering
                   system than the TDT-3 corpus and strongly suggest that the
                   TDT-3 event tracking task itself is more difficult than
                   the TREC batch filtering task. We also show that
                   optimizing performance in TREC-9 and TDT-3 tends to result
                   in systems with different performance characteristics,
                   confounding any meaningful comparison between the two
                   domains, and that T9P and Ctrk both have properties that
                   make them undesirable as general information filtering
                   metrics.},
}
@article{Soboroff02,
   author       = {Ian M. Soboroff and Charles K. Nicholas},
   title        = {Related, but not Relevant: Content-Based Collaborative
                   Filtering in {TREC-8}},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {2/3},
   volume       = {5},
   pages        = {189--208},
   year         = {2002},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Historically, solutions to the TREC filtering tasks have
                   focused exclusively on the content of documents and search
                   topic descriptions as training data. These approaches are
                   well-known for their ability to focus on those salient
                   concepts in the document stream which are most useful for
                   separating relevant documents from irrelevant ones.
                   However, one kind of information that has not been used is
                   the relationships among the topics themselves. In our
                   TREC-8 routing experiments, we employed a collaborative
                   (or social) filtering algorithm, based on latent semantic
                   indexing which highlights common term usage patterns among
                   groups of filtering profiles. Our hypothesis was that this
                   would allow related topics to share common relevant
                   documents. We found, however, that the algorithm also
                   recommends many documents of related, yet irrelevant
                   interest. As a result of this process, many similar search
                   topics are ``linked'' together by common sets of documents
                   recommended to them. We visualize these topic
                   relationships using graphs where topics are nodes and
                   edges exist where two topics share a recommended
                   document.},
}
@article{Eichmann02,
   author       = {David Eichmann and Padmini Srinivasan},
   title        = {Adaptive Filtering of Newswire Stories using Two-Level
                   Clustering},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {2/3},
   volume       = {5},
   pages        = {209--237},
   year         = {2002},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Adaptive filtering of news is an area of information
                   retrieval gaining substantial interest as services become
                   more available on the Internet. This paper reports on a
                   number of experiments involving a two-level clustering
                   approach using a variety of techniques including threshold
                   adaptation, topic vocabulary adaptation and both noun
                   phrase and named entity adaptation. Our goal in this
                   exploratory research is to empirically compare alternative
                   configurations of our filtering approach that will allow
                   us to better understand the relative value of the
                   component subsystems.},
}
@article{Robertson02a,
  author    = {Stephen Robertson},
  title     = {Threshold Setting and Performance Optimization in
               Adaptive Filtering},
  journal   = {Information Retrieval},
  year      = {2002},
  volume    = {5},
  number    = {2--3},
  pages     = {239--256},
  publisher = {Kluwer Academic Publishers},
  issn      = {1386-4564},
  url       = {http://www.kluweronline.com/issn/1386-4564},
  abstract  = {An experimental adaptive filtering system, built on the
               Okapi search engine, is described. In addition to the
               regular text retrieval functions, the system requires a
               complex set of procedures for setting score thresholds and
               adapting them following feedback. These procedures need to
               be closely related to the evaluation measures to be used.
               A mixture of quantitative methods relating a threshold to
               the number of documents expected to be retrieved in a time
               period, and qualitative methods relating to the
               probability of relevance, is defined. Experiments under
               the TREC-9 Adaptive Filtering Track rules are reported.
               The system is seen to perform reasonably well in
               comparison with other systems at TREC. Some of the
               variables that may affect performance are investigated.},
}
@article{Robertson02b,
  author    = {Stephen Robertson},
  title     = {Comparing the Performance of Adaptive Filtering and
               Ranked Output Systems},
  journal   = {Information Retrieval},
  year      = {2002},
  volume    = {5},
  number    = {2--3},
  pages     = {257--268},
  publisher = {Kluwer Academic Publishers},
  issn      = {1386-4564},
  url       = {http://www.kluweronline.com/issn/1386-4564},
  abstract  = {Some insight into the behavior of adaptive filtering
               systems may be gained by comparing them with similar
               ranked-output retrieval systems. This is not easy;
               however, a new optimization measure, introduced for the
               TREC-9 filtering track, makes some such comparison
               possible. A series of experiments using the TREC-9
               filtering data shows that filtering effectiveness is
               comparable to routing effectiveness, and demonstrates the
               gains to be made from adaptation.},
}
@article{Herlocker02,
  author    = {Jon Herlocker and Joseph A. Konstan and John Riedl},
  title     = {An Empirical Analysis of Design Choices in
               Neighborhood-Based Collaborative Filtering Algorithms},
  journal   = {Information Retrieval},
  year      = {2002},
  volume    = {5},
  number    = {4},
  pages     = {287--310},
  publisher = {Kluwer Academic Publishers},
  issn      = {1386-4564},
  url       = {http://www.kluweronline.com/issn/1386-4564},
  abstract  = {Collaborative filtering systems predict a user's interest
               in new items based on the recommendations of other people
               with similar interests. Instead of performing content
               indexing or content analysis, collaborative filtering
               systems rely entirely on interest ratings from members of
               a participating community. Since predictions are based on
               human ratings, collaborative filtering systems have the
               potential to provide filtering based on complex
               attributes, such as quality, taste, or aesthetics. Many
               implementations of collaborative filtering apply some
               variation of the neighborhood-based prediction algorithm.
               Many variations of similarity metrics, weighting
               approaches, combination measures, and rating normalization
               have appeared in each implementation. For these parameters
               and others, there is no consensus as to which choice of
               technique is most appropriate for what situations, nor how
               significant an effect on accuracy each parameter has.
               Consequently, every person implementing a collaborative
               filtering system must make hard design choices with little
               guidance. This article provides a set of recommendations
               to guide design of neighborhood-based prediction systems,
               based on the results of an empirical study. We apply an
               analysis framework that divides the neighborhood-based
               prediction approach into three components and then
               examines variants of the key parameters in each component.
               The three components identified are similarity
               computation, neighbor selection, and rating combination.},
}
@article{Nilsson02,
   author       = {Martin Nilsson},
   title        = {Hierarchical Clustering Using Non-Greedy Principal
                   Direction Divisive Partitioning},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {4},
   volume       = {5},
   pages        = {311--321},
   year         = {2002},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {We present a non-greedy version of the recently published
                   Principal Direction Divisive Partitioning (PDDP)
                   algorithm. The PDDP algorithm creates a hierarchical
                   taxonomy of a data set by successively splitting the data
                   into sub-clusters. At each level the cluster with largest
                   variance is split by a hyper-plane orthogonal to its
                   leading principal component. The PDDP algorithm is known
                   to produce high quality clusters, especially when applied
                   to high dimensional data, such as document-word feature
                   matrices. It also scales well with both the size and the
                   dimensionality of the data set. However, at each level
                   only the locally optimal choice of splitting is considered.
                   At a later stage this often leads to a non-optimal global
                   partitioning of the data. The non-greedy version of the
                   PDDP algorithm (NGPDDP) presented in this paper addresses
                   this problem. At each level multiple alternative splitting
                   strategies are considered. Results from applying the
                   algorithm to generated and real data (feature vectors from
                   sets of text documents) are presented. The results show
                   substantial improvements in the cluster quality.},
}
@article{French02,
  author    = {James C. French and Allison L. Powell and Fredric Gey and
               Natalia Perelman},
  title     = {Exploiting Manual Indexing to Improve Collection
               Selection and Retrieval Effectiveness},
  journal   = {Information Retrieval},
  year      = {2002},
  volume    = {5},
  number    = {4},
  pages     = {323--351},
  publisher = {Kluwer Academic Publishers},
  issn      = {1386-4564},
  url       = {http://www.kluweronline.com/issn/1386-4564},
  abstract  = {Vocabulary incompatibilities arise when the terms used to
               index a document collection are largely unknown, or at
               least not well-known to the users who eventually search
               the collection. No matter how comprehensive or
               well-structured the indexing vocabulary, it is of little
               use if it is not used effectively in query formulation.
               This paper demonstrates that techniques for mapping user
               queries into the controlled indexing vocabulary have the
               potential to radically improve document retrieval
               performance. We also show how the use of controlled
               indexing vocabulary can be employed to achieve performance
               gains for collection selection. Finally, we demonstrate
               the potential benefit of combining these two techniques in
               an interactive retrieval environment. Given a user query,
               our evaluation approach simulates the human user's choice
               of terms for query augmentation given a list of controlled
               vocabulary terms suggested by a system. This strategy lets
               us evaluate interactive strategies without the need for
               human subjects.},
}
@article{Bookstein02,
  author    = {Abraham Bookstein and Vladimir A. Kulyukin and Timo
               Raita},
  title     = {Generalized {H}amming Distance},
  journal   = {Information Retrieval},
  year      = {2002},
  volume    = {5},
  number    = {4},
  pages     = {353--375},
  publisher = {Kluwer Academic Publishers},
  issn      = {1386-4564},
  url       = {http://www.kluweronline.com/issn/1386-4564},
  abstract  = {Many problems in information retrieval and related fields
               depend on a reliable measure of the distance or similarity
               between objects that, most frequently, are represented as
               vectors. This paper considers vectors of bits. Such data
               structures implement entities as diverse as bitmaps that
               indicate the occurrences of terms and bitstrings
               indicating the presence of edges in images. For such
               applications, a popular distance measure is the Hamming
               distance. The value of the Hamming distance for
               information retrieval applications is limited by the fact
               that it counts only exact matches, whereas in information
               retrieval, corresponding bits that are close by can still
               be considered to be almost identical. We define a
               ``Generalized Hamming distance'' that extends the Hamming
               concept to give partial credit for near misses, and
               suggest a dynamic programming algorithm that permits it to
               be computed efficiently. We envision many uses for such a
               measure. In this paper we define and prove some basic
               properties of the ``Generalized Hamming distance'', and
               illustrate its use in the area of object recognition. We
               evaluate our implementation in a series of experiments,
               using autonomous robots to test the measure's
               effectiveness in relating similar bitstrings.},
}
@article{Trotman03,
   author       = {Andrew Trotman},
   title        = {Compressing Inverted Files},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {1},
   volume       = {6},
   pages        = {5--19},
   year         = {2003},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Research into inverted file compression has focused on
                   compression ratio---how small the indexes can be.
                   Compression ratio is important for fast interactive
                   searching. It is taken as read, the smaller the index, the
                   faster the search. The premise ``smaller is better'' may
                   not be true. To truly build faster indexes it is often
                   necessary to forfeit compression. For inverted lists
                   consisting of only 128 occurrences compression may only
                   add overhead. Perhaps the inverted list could be stored in
                   128 bytes in place of 128 words, but it must still be
                   stored on disk. If the minimum disk sector read size is
                   512 bytes and the word size is 4 bytes, then both the
                   compressed and raw postings would require one disk seek
                   and one disk sector read. A less efficient compression
                   technique may increase the file size, but decrease
                   load/decompress time, thereby increasing throughput.
                   Examined here are five compression techniques, Golomb,
                   Elias gamma, Elias delta, Variable Byte Encoding and
                   Binary Interpolative Coding. The effect on file size, file
                   seek time, and file read time are all measured as is
                   decompression time. A quantitative measure of throughput
                   is developed and the performance of each method is
                   determined.},
}
@article{vanderPol03,
   author       = {van der Pol, Ruud},
   title        = {{Dipe-D}: A Tool for Knowledge-Based Query Formulation in
                   Information Retrieval},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {1},
   volume       = {6},
   pages        = {21--47},
   year         = {2003},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The paper reports the development of Dipe-D, a
                   knowledge-based procedure for the formulation of Boolean
                   queries in information retrieval. Dipe-D creates a query
                   in two steps: (1) the user's information need is developed
                   interactively, while identifying the concepts of the
                   information need, and subsequently (2) the collection of
                   concepts identified is automatically transformed into a
                   Boolean query. In the first step, the subject area---as
                   represented in a knowledge base---is explored by the user.
                   He does this by means of specifying the (concepts that
                   meet his) information need in an artificial language and
                   looking through the solution as provided by the computer.
                   The specification language allows one to specify concepts
                   by their features, both in precise terms as well as
                   vaguely. By repeating the process of specifying the
                   information need and exploring the resulting concepts, the
                   user may precisely single out the concepts that describe
                   his information need. In the second step, the program
                   provides the designations (and variants) for the concepts
                   identified, and connects them by appropriate operators.
                   Dipe-D is meant to improve on existing procedures that
                   identify the concepts less systematically, create a query
                   manually, and then sometimes expand that query.
                   Experiments are reported on each of the two steps; they
                   indicate that the first step identifies only but not all
                   the relevant concepts, and the second step performs (at
                   least) as good as human beings do.},
}
@article{Sakkis03,
  author    = {Georgios Sakkis and Ion Androutsopoulos and Georgios
               Paliouras and Vangelis Karkaletsis and Constantine D.
               Spyropoulos and Panagiotis Stamatopoulos},
  title     = {A Memory-Based Approach to Anti-Spam Filtering for
               Mailing Lists},
  journal   = {Information Retrieval},
  year      = {2003},
  volume    = {6},
  number    = {1},
  pages     = {49--73},
  publisher = {Kluwer Academic Publishers},
  issn      = {1386-4564},
  url       = {http://www.kluweronline.com/issn/1386-4564},
  abstract  = {This paper presents an extensive empirical evaluation of
               memory-based learning in the context of anti-spam
               filtering, a novel cost-sensitive application of text
               categorization that attempts to identify automatically
               unsolicited commercial messages that flood mailboxes.
               Focusing on anti-spam filtering for mailing lists, a
               thorough investigation of the effectiveness of a
               memory-based anti-spam filter is performed using a
               publicly available corpus. The investigation includes
               different attribute and distance-weighting schemes, and
               studies on the effect of the neighborhood size, the size
               of the attribute set, and the size of the training corpus.
               Three different cost scenarios are identified, and
               suitable cost-sensitive evaluation functions are employed.
               We conclude that memory-based anti-spam filtering for
               mailing lists is practically feasible, especially when
               combined with additional safety nets. Compared to a
               previously tested Naive Bayes filter, the memory-based
               filter performs on average better, particularly when the
               misclassification cost for non-spam messages is high.},
}
@article{Elovici03,
  author    = {Yuval Elovici and Bracha Shapira and Paul B. Kantor},
  title     = {Using the Information Structure Model to Compare
               Profile-Based Information Filtering Systems},
  journal   = {Information Retrieval},
  year      = {2003},
  volume    = {6},
  number    = {1},
  pages     = {75--97},
  publisher = {Kluwer Academic Publishers},
  issn      = {1386-4564},
  url       = {http://www.kluweronline.com/issn/1386-4564},
  abstract  = {In the IR field it is clear that the value of a system
               depends on the cost and benefit profiles of its users. It
               would seem obvious that different users would prefer
               different systems. In the TREC-9 filtering track, systems
               are evaluated by a utility measure specifying a given cost
               and benefit. However, in the study of decision systems it
               is known that, in some cases, one system may be
               unconditionally better than another. In this paper we
               employ a decision theoretic approach to find conditions
               under which an Information Filtering (IF) system is
               unconditionally superior to another for all users
               regardless of their cost and benefit profiles. It is well
               known that if two IF systems have equal precision the
               system with better recall will be preferred by all users.
               Similarly, with equal recall, better precision is
               universally preferred. We confirm these known results and
               discover an unexpected dominance relation in which a
               system with lower recall will be universally preferred
               provided its precision is sufficiently higher.},
}
@article{Hawking03,
  author    = {David Hawking and Stephen Robertson},
  title     = {On Collection Size and Retrieval Effectiveness},
  journal   = {Information Retrieval},
  year      = {2003},
  volume    = {6},
  number    = {1},
  pages     = {99--105},
  publisher = {Kluwer Academic Publishers},
  issn      = {1386-4564},
  url       = {http://www.kluweronline.com/issn/1386-4564},
  abstract  = {The relationship between collection size and retrieval
               effectiveness is particularly important in the context of
               Web search. We investigate it first analytically and then
               experimentally, using samples and subsets of test
               collections. Different retrieval systems vary in how the
               score assigned to an individual document in a sample
               collection relates to the score it receives in the full
               collection; we identify four cases. We apply signal
               detection (SD) theory to retrieval from samples, taking
               into account the four cases and using a variety of shapes
               for relevant and irrelevant distributions. We note that
               the SD model subsumes several earlier hypotheses about the
               causes of the decreased precision in samples. We also
               discuss other models which contribute to an understanding
               of the phenomenon, particularly relating to the effects of
               discreteness. Different models provide complementary
               insights. Extensive use is made of test data, some from
               official submissions to the TREC-6 VLC track and some new,
               to illustrate the effects and test hypotheses. We
               empirically confirm predictions, based on SD theory, that
               P@n should decline when moving to a sample collection and
               that average precision and R-precision should remain
               constant. SD theory suggests the use of recall-fallout
               plots as operating characteristic (OC) curves. We plot OC
               curves of this type for a real retrieval system and query
               set and show that curves for sample collections are
               similar but not identical to the curve for the full
               collection.},
}
@article{Lu03,
  author    = {Zhihong Lu and Kathryn S. McKinley},
  title     = {Partial Collection Replication for Information
               Retrieval},
  journal   = {Information Retrieval},
  year      = {2003},
  volume    = {6},
  number    = {2},
  pages     = {159--198},
  publisher = {Kluwer Academic Publishers},
  issn      = {1386-4564},
  url       = {http://www.kluweronline.com/issn/1386-4564},
  abstract  = {The explosion of content in distributed information
               retrieval (IR) systems requires new mechanisms in order to
               attain timely and accurate retrieval of unstructured text.
               This paper shows how to exploit locality by building,
               using, and searching partial replicas of text collections
               in a distributed IR system. In this work, a partial
               replica includes a subset of the documents from larger
               collection(s) and the corresponding inference network
               search mechanism. For each query, the distributed system
               determines if partial replica is a good match and then
               searches it, or it searches the original collection. We
               demonstrate the scenarios where partial replication
               performs better than systems that use caches which only
               store previous query and answer pairs. We first use logs
               from THOMAS and Excite to examine query locality using
               query similarity versus exact match. We show that
               searching replicas can improve locality (from 3 to 19\%)
               over the exact match required by caching. Replicas
               increase locality because they satisfy queries which are
               distinct but return the same or very similar answers. We
               then present a novel inference network replica selection
               function. We vary its parameters and compare it to
               previous collection selection functions, demonstrating a
               configuration that directs most of the appropriate queries
               to replicas in a replica hierarchy. We then explore the
               performance of partial replication in a distributed IR
               system. We compare it with caching and partitioning. Our
               validated simulator shows that the increases in locality
               due to replication make it preferable to caching alone,
               and that even a small increase of 4\% in locality
               translates into a performance advantage. We also show a
               hybrid system with caches and replicas that performs
               better than each on their own.},
}
@article{Mostafa03,
   author       = {Javed Mostafa and Snehasis Mukhopadhyay and Mathew
                   Palakal},
   title        = {Simulation Studies of Different Dimensions of Users'
                   Interests and their Impact on User Modeling and
                   Information Filtering},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {2},
   volume       = {6},
   pages        = {199--223},
   year         = {2003},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Modeling users in information filtering systems is a
                   difficult challenge due to dimensions such as nature,
                   scope, and variability of interests. Numerous
                   machine-learning approaches have been proposed for user
                   modeling in filtering systems. The focus has been
                   primarily on techniques for user model capture and
                   representation, with relatively simple assumptions made
                   about the type of users' interests. Although many studies
                   claim to deal with ``adaptive'' techniques and thus they
                   pay heed to the fact that different types of interests
                   must be modeled or even changes in interests have to be
                   captured, few studies have actually focused on the dynamic
                   nature and the variability of user-interests and their
                   impact on the modeling process. A simulation based
                   information filtering environment called SIMSIFTER was
                   developed to overcome some of the barriers associated with
                   conducting studies on user-oriented factors that can
                   impact interests. SIMSIFTER implemented a user modeling
                   approach known as reinforcement learning that has proven
                   to be effective in previous filtering studies involving
                   humans. This paper reports on several studies conducted
                   using SIMSIFTER that examined the impact of key dimensions
                   such as type of interests, rate of change of interests and
                   level of user-involvement on modeling accuracy and
                   ultimately on filtering effectiveness.},
}
@article{Guo03,
  author    = {David Guo and Michael W. Berry and Bryan B. Thompson and
               Sidney Bailin},
  title     = {Knowledge-Enhanced Latent Semantic Indexing},
  journal   = {Information Retrieval},
  year      = {2003},
  volume    = {6},
  number    = {2},
  pages     = {225--250},
  publisher = {Kluwer Academic Publishers},
  issn      = {1386-4564},
  url       = {http://www.kluweronline.com/issn/1386-4564},
  abstract  = {Latent Semantic Indexing (LSI) is a popular information
               retrieval model for concept-based searching. As with many
               vector space IR models, LSI requires an existing
               term-document association structure such as a
               term-by-document matrix. The term-by-document matrix,
               constructed during document parsing, can only capture
               weighted vocabulary occurrence patterns in the documents.
               However, for many knowledge domains there are pre-existing
               semantic structures that could be used to organize and
               categorize information. The goals of this study are (i) to
               demonstrate how such semantic structures can be
               automatically incorporated into the LSI vector space
               model, and (ii) to measure the effect of these structures
               on query matching performance. The new approach, referred
               to as Knowledge-Enhanced LSI, is applied to documents in
               the OHSUMED medical abstracts collection using the
               semantic structures provided by the UMLS Semantic Network
               and MeSH. Results based on precision-recall data (11-point
               average precision values) indicate that a MeSH-enhanced
               search index is capable of delivering noticeable
               incremental performance gain (as much as 35\%) over the
               original LSI for modest constraints on precision. This
               performance gain is achieved by replacing the original
               query with the MeSH heading extracted from the query text
               via regular expression matches.},
}
@article{Vechtomova03,
   author       = {Olga Vechtomova and Stephen Robertson and Susan Jones},
   title        = {Query Expansion with Long-Span Collocates},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {2},
   volume       = {6},
   pages        = {251--273},
   year         = {2003},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The paper presents two novel approaches to query
                   expansion with long-span collocates---words, significantly
                   co-occurring in topic-size windows with query terms. In
                   the first approach---global collocation
                   analysis---collocates
                   of query terms are extracted from the entire collection,
                   in the second---local collocation analysis---from a subset
                   of retrieved documents. The significance of association
                   between collocates was estimated using modified Mutual
                   Information and Z score. The techniques were tested using
                   the Okapi IR system. The effect of different parameters on
                   performance was evaluated: window size, number of
                   expansion terms, measures of collocation significance and
                   types of expansion terms. We present performance results
                   of these techniques and provide comparison with related
                   approaches.},
}
@article{Wang03,
   author       = {Wang, Quan and Ng, Yiu-Kai},
   title        = {An Ontology-Based Binary-Categorization Approach for
                   Recognizing Multiple-Record {W}eb Documents Using a
                   Probabilistic Retrieval Model},
   journal      = {Information Retrieval},
   volume       = {6},
   number       = {3/4},
   pages        = {295--332},
   year         = {2003},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The Web contains a tremendous amount of information. It
                   is challenging to determine which Web documents are
                   relevant to a user query, and even more challenging to
                   rank them according to their degrees of relevance. In this
                   paper, we propose a probabilistic retrieval model using
                   logistic regression for recognizing multiple-record Web
                   documents against an application ontology, a simple
                   conceptual modeling approach. We notice that many Web
                   documents contain a sequence of chunks of textual
                   information, each of which constitutes a ``record.'' This
                   type of documents is referred to as multiple-record
                   documents. In our categorization approach, a document is
                   represented by a set of term frequencies of index terms, a
                   density heuristic value, and a grouping heuristic value.
                   We first apply the logistic regression analysis on
                   relevant probabilities using the (i) index terms, (ii)
                   density value, and (iii) grouping value of each training
                   document. Hereafter, the relevant probability of each test
                   document is interpolated from the fitting curves. Contrary
                   to other probabilistic retrieval models, our model makes
                   only a weak independent assumption and is capable of
                   handling any important dependent relationships among index
                   terms. In addition, we use logistic regression, instead of
                   linear regression analysis, because the relevance
                   probabilities of training documents are discrete. Using a
                   test set of car-ads and another one for obituary Web
                   documents, our probabilistic model achieves the averaged
                   recall ratio of 100\%, precision ratio of 83.3\%, and
                   accuracy ratio of 92.5\%.},
}
@article{Huang03,
   author       = {Huang, Xiangji and Peng, Fuchun and Schuurmans, Dale and
                   Cercone, Nick and Robertson, Stephen E.},
   title        = {Applying Machine Learning to Text Segmentation for
                   Information Retrieval},
   journal      = {Information Retrieval},
   volume       = {6},
   number       = {3/4},
   pages        = {333--362},
   year         = {2003},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {We propose a self-supervised word segmentation technique
                   for text segmentation in Chinese information retrieval.
                   This method combines the advantages of traditional
                   dictionary based, character based and mutual information
                   based approaches, while overcoming many of their
                   shortcomings. Experiments on TREC data show this method is
                   promising. Our method is completely language independent
                   and unsupervised, which provides a promising avenue for
                   constructing accurate multi-lingual or cross-lingual
                   information retrieval systems that are flexible and
                   adaptive. We find that although the segmentation accuracy
                   of self-supervised segmentation is not as high as some
                   other segmentation methods, it is enough to give good
                   retrieval performance. It is commonly believed that word
                   segmentation accuracy is monotonically related to
                   retrieval performance in Chinese information retrieval.
                   However, for Chinese, we find that the relationship
                   between segmentation and retrieval performance is in fact
                   nonmonotonic; that is, at around 70\% word segmentation
                   accuracy an over-segmentation phenomenon begins to occur
                   which leads to a reduction in information retrieval
                   performance. We demonstrate this effect by presenting an
                   empirical investigation of information retrieval on
                   Chinese TREC data, using a wide variety of word
                   segmentation algorithms with word segmentation accuracies
                   ranging from 44\% to 95\%, including 70\% word
                   segmentation accuracy from our self-supervised
                   word-segmentation approach. It appears that the main
                   reason for the drop in retrieval performance is that
                   correct compounds and collocations are preserved by
                   accurate segmenters, while they are broken up by less
                   accurate (but reasonable) segmenters, to a surprising
                   advantage. This suggests that words themselves might be
                   too broad a notion to conveniently capture the general
                   semantic meaning of Chinese text. Our research suggests
                   machine learning techniques can play an important role in
                   building adaptable information retrieval systems and
                   different evaluation standards for word segmentation
                   should be given to different applications.},
}
@article{Nottelmann03,
   author       = {Nottelmann, Henrik and Fuhr, Norbert},
   title        = {From Retrieval Status Values to Probabilities of
                   Relevance for Advanced {IR} Applications},
   journal      = {Information Retrieval},
   volume       = {6},
   number       = {3/4},
   pages        = {363--388},
   year         = {2003},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Information Retrieval systems typically sort the result
                   with respect to document retrieval status values (RSV).
                   According to the Probability Ranking Principle, this
                   ranking ensures optimum retrieval quality if the RSVs are
                   monotonously increasing with the probabilities of
                   relevance (as e.g. for probabilistic IR models). However,
                   advanced applications like filtering or distributed
                   retrieval require estimates of the actual probability of
                   relevance. The relationship between the RSV of a document
                   and its probability of relevance can be described by a
                   ``normalisation'' function which maps the retrieval status
                   value onto the probability of relevance (``mapping
                   functions''). In this paper, we explore the use of linear
                   and logistic mapping functions for different retrieval
                   methods. In a series of upper-bound experiments, we
                   compare the approximation quality of the different mapping
                   functions. We also investigate the effect on the resulting
                   retrieval quality in distributed retrieval (only merging,
                   without resource selection). These experiments show that
                   good estimates of the actual probability of relevance can
                   be achieved, and that the logistic model outperforms the
                   linear one. Retrieval quality for distributed retrieval is
                   only slightly improved by using the logistic function.},
}
@article{Peters04,
   author       = {Peters, Carol and Braschler, Martin},
   title        = {Editorial},
   journal      = {Information Retrieval},
   volume       = {7},
   number       = {1/2},
   pages        = {5},
   year         = {2004},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Braschler04,
   author       = {Braschler, Martin and Peters, Carol},
   title        = {{Cross-Language Evaluation Forum}: Objectives, Results,
                   Achievements},
   journal      = {Information Retrieval},
   volume       = {7},
   number       = {1/2},
   pages        = {7--31},
   year         = {2004},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Hollink04,
   author       = {Hollink, Vera and Kamps, Jaap and Monz, Christof and
                   de Rijke, Maarten},
   title        = {Monolingual Document Retrieval for {E}uropean Languages},
   journal      = {Information Retrieval},
   volume       = {7},
   number       = {1/2},
   pages        = {33--52},
   year         = {2004},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Bertoldi04,
   author       = {Bertoldi, Nicola and Federico, Marcello},
   title        = {Statistical Models for Monolingual and Bilingual
                   Information Retrieval},
   journal      = {Information Retrieval},
   volume       = {7},
   number       = {1/2},
   pages        = {53--72},
   year         = {2004},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{McNamee04,
   author       = {McNamee, Paul and Mayfield, James},
   title        = {Character N-Gram Tokenization for {E}uropean Language
                   Text Retrieval},
   journal      = {Information Retrieval},
   volume       = {7},
   number       = {1/2},
   pages        = {73--97},
   year         = {2004},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Hedlund04,
   author       = {Hedlund, Turid and Airio, Eija and Keskustalo, Heikki and
                   Lehtokangas, Raija and Pirkola, Ari and
                   J{\"{a}}rvelin, Kalervo},
   title        = {Dictionary-Based Cross-Language Information Retrieval:
                   Learning Experiences from {CLEF} 2000--2002},
   journal      = {Information Retrieval},
   volume       = {7},
   number       = {1/2},
   pages        = {99--119},
   year         = {2004},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Savoy04,
   author       = {Savoy, Jacques},
   title        = {Combining Multiple Strategies for Effective Monolingual
                   and Cross-Language Retrieval},
   journal      = {Information Retrieval},
   volume       = {7},
   number       = {1/2},
   pages        = {121--148},
   year         = {2004},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Chen04,
   author       = {Chen, Aitao and Gey, Fredric C.},
   title        = {Multilingual Information Retrieval Using Machine
                   Translation, Relevance Feedback and Decompounding},
   journal      = {Information Retrieval},
   volume       = {7},
   number       = {1/2},
   pages        = {149--182},
   year         = {2004},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Braschler04a,
   author       = {Braschler, Martin},
   title        = {Combination Approaches for Multilingual Text Retrieval},
   journal      = {Information Retrieval},
   volume       = {7},
   number       = {1/2},
   pages        = {183--204},
   year         = {2004},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Oard04,
   author       = {Oard, Douglas W. and Gonzalo, Julio and Sanderson, Mark and
                   L{\'{o}}pez-Ostenero, Fernando and Wang, Jianqiang},
   title        = {Interactive Cross-Language Document Selection},
   journal      = {Information Retrieval},
   volume       = {7},
   number       = {1/2},
   pages        = {205--228},
   year         = {2004},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Sebastiani04,
   author       = {Sebastiani, Fabrizio},
   title        = {Introduction: Special Issue on the {25th European
                   Conference on Information Retrieval Research}},
   journal      = {Information Retrieval},
   volume       = {7},
   number       = {3/4},
   pages        = {235--237},
   year         = {2004},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Gurrin04,
   author       = {Gurrin, Cathal and Smeaton, Alan F.},
   title        = {Replicating {W}eb Structure in Small-Scale Test
                   Collections},
   journal      = {Information Retrieval},
   volume       = {7},
   number       = {3/4},
   pages        = {239--263},
   year         = {2004},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Harper04,
   author       = {Harper, David J. and Koychev, Ivan and Sun, Yixing and
                   Pirie, Iain},
   title        = {Within-Document Retrieval: A User-Centred Evaluation of
                   Relevance Profiling},
   journal      = {Information Retrieval},
   volume       = {7},
   number       = {3/4},
   pages        = {265--290},
   year         = {2004},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Braschler04b,
   author       = {Braschler, Martin and Ripplinger, B{\"{a}}rbel},
   title        = {How Effective is Stemming and Decompounding for {G}erman
                   Text Retrieval?},
   journal      = {Information Retrieval},
   volume       = {7},
   number       = {3/4},
   pages        = {291--316},
   year         = {2004},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Peng04,
   author       = {Peng, Fuchun and Schuurmans, Dale and Wang, Shaojun},
   title        = {Augmenting Naive {B}ayes Classifiers with Statistical
                   Language Models},
   journal      = {Information Retrieval},
   volume       = {7},
   number       = {3/4},
   pages        = {317--345},
   year         = {2004},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Makkonen04,
   author       = {Makkonen, Juha and Ahonen-Myka, Helena and
                   Salmenkivi, Marko},
   title        = {Simple Semantics in Topic Detection and Tracking},
   journal      = {Information Retrieval},
   volume       = {7},
   number       = {3/4},
   pages        = {347--368},
   year         = {2004},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Lin04,
   author       = {Lin, Raz and Kraus, Sarit and Tew, Jeffrey},
   title        = {{OSGS}---A Personalized Online Store for E-Commerce
                   Environments},
   journal      = {Information Retrieval},
   volume       = {7},
   number       = {3/4},
   pages        = {369--394},
   year         = {2004},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Cheung04,
   author       = {Cheung, Kwok-Wai and Tian, Lily F.},
   title        = {Learning User Similarity and Rating Style for
                   Collaborative Recommendation},
   journal      = {Information Retrieval},
   volume       = {7},
   number       = {3/4},
   pages        = {395--410},
   year         = {2004},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{Spinellis05,
   author       = {Spinellis, Diomidis},
   title        = {Index-Based Persistent Document Identifiers},
   journal      = {Information Retrieval},
   volume       = {8},
   number       = {1},
   pages        = {5--24},
   year         = {2005},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The infrastructure of a typical search engine can be used
                   to calculate and resolve persistent document identifiers:
                   a string that can uniquely identify and locate a document
                   on the Internet without reference to its original location
                   (URL). Bookmarking a document using such an identifier
                   allows its retrieval even if the document's URL, and, in
                   many cases, its contents change. Web client applications
                   can offer facilities for users to bookmark a page by
                   reference to a search engine and the persistent identifier
                   instead of the original URL. The identifiers are
                   calculated using a global Internet term index; a
                   document's unique identifier consists of a word or word
                   combination that occurs uniquely in the specific document.
                   We use a genetic algorithm to locate a minimal unique
                   document identifier: the shortest word or word combination
                   that will locate the document. We tested our approach by
                   implementing tools for indexing a document collection,
                   calculating the persistent identifiers, performing
                   queries, and distributing the computation and storage load
                   among many computers.},
}
@article{Geffet05,
   author       = {Geffet, Maayan and Wiseman, Yair and Feitelson, Dror},
   title        = {Automatic Alphabet Recognition},
   journal      = {Information Retrieval},
   volume       = {8},
   number       = {1},
   pages        = {25--40},
   year         = {2005},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The last step of the Information Retrieval process is to
                   display the found documents to the user. However, some
                   difficulties might occur at that point. English texts are
                   usually written in the ASCII standard. Unlike the English
                   language, many languages have different character sets,
                   and do not have one standard. This plurality of standards
                   causes problems, especially in a web environment, where
                   one may download a document with an unknown standard. This
                   paper suggests a purely automatic way of finding the
                   standard which was used by the document writer based on
                   the statistical letters distribution in the language. We
                   developed a vector-space-based method that creates
                   frequencies vectors for each letter of the language and
                   then matches a new document's vectors to the pre-computed
                   templates. The algorithm was applied on various types of
                   corpora in Hebrew, Russian and English, and provides an
                   efficient solution to the stated problem in most cases.},
}
@article{Xu05,
   author       = {Xu, Yunjie and Benaroch, Michel},
   title        = {Information Retrieval with a Hybrid Automatic Query
                   Expansion and Data Fusion Procedure},
   journal      = {Information Retrieval},
   volume       = {8},
   number       = {1},
   pages        = {41--65},
   year         = {2005},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {We propose a hybrid information retrieval (IR) procedure
                   that builds on two well-known IR approaches: data fusion
                   and query expansion via relevance feedback. This IR
                   procedure is designed to exploit the strengths of data
                   fusion and relevance feedback and to avoid some weaknesses
                   of these approaches. We show that our IR procedure is
                   built on postulates that can be justified analytically and
                   empirically. Additionally, we offer an empirical
                   investigation of the procedure, showing that it is
                   superior to relevance feedback on some dimensions and
                   comparable on other dimensions. The empirical
                   investigation also verifies the conditions under which the
                   use of our IR procedure could be beneficial.},
}
@article{Bennett05,
   author       = {Bennett, Paul N. and Dumais, Susan T. and Horvitz, Eric},
   title        = {The Combination of Text Classifiers Using Reliability
                   Indicators},
   journal      = {Information Retrieval},
   volume       = {8},
   number       = {1},
   pages        = {67--100},
   year         = {2005},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {The intuition that different text classifiers behave in
                   qualitatively different ways has long motivated attempts
                   to build a better metaclassifier via some combination of
                   classifiers. We introduce a probabilistic method for
                   combining classifiers that considers the context-sensitive
                   reliabilities of contributing classifiers. The method
                   harnesses reliability indicators---variables that provide
                   signals about the performance of classifiers in different
                   situations. We provide background, present procedures for
                   building metaclassifiers that take into consideration both
                   reliability indicators and classifier outputs, and review
                   a set of comparative studies undertaken to evaluate the
                   methodology.},
}
@article{Liggett05,
   author       = {Liggett, Walter and Buckley, Chris},
   title        = {System Performance and Natural Language Expression of
                   Information Needs},
   journal      = {Information Retrieval},
   volume       = {8},
   number       = {1},
   pages        = {101--128},
   year         = {2005},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Consider information retrieval systems that respond to a
                   query (a natural language statement of a topic, an
                   information need) with an ordered list of 1000 documents
                   from the document collection. From the responses to
                   queries that all express the same topic, one can discern
                   how the words associated with a topic result in particular
                   system behavior. From what is discerned from different
                   topics, one can hypothesize abstract topic factors that
                   influence system performance. An example of such a factor
                   is the specificity of the topic's primary key word. This
                   paper shows that statements about the effect of abstract
                   topic factors on system performance can be supported
                   empirically. A combination of statistical methods is
                   applied to system responses from NIST's Text REtrieval
                   Conference. We analyze each topic using a measure of
                   irrelevant-document exclusion computed for each response
                   and a measure of dissimilarity between relevant-document
                   return orders computed for each pair of responses. We
                   formulate topic factors through graphical comparison of
                   measurements for different topics. Finally, we propose for
                   each topic a four-dimensional summarization that we use to
                   select topic comparisons likely to depict topic factors
                   clearly.},
}
@article{Lemire05,
   author       = {Lemire, Daniel},
   title        = {Scale and Translation Invariant Collaborative Filtering
                   Systems},
   journal      = {Information Retrieval},
   volume       = {8},
   number       = {1},
   pages        = {129--150},
   year         = {2005},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {Collaborative filtering systems are prediction algorithms
                   over sparse data sets of user preferences. We modify a
                   wide range of state-of-the-art collaborative filtering
                   systems to make them scale and translation invariant and
                   generally improve their accuracy without increasing their
                   computational cost. Using the EachMovie and the Jester
                   data sets, we show that learning-free constant time scale
                   and translation invariant schemes outperforms other
                   learning-free constant time schemes by at least 3\% and
                   perform as well as expensive memory-based schemes (within
                   4\%). Over the Jester data set, we show that a scale and
                   translation invariant Eigentaste algorithm outperforms
                   Eigentaste 2.0 by 20\%. These results suggest that scale
                   and translation invariance is a desirable property.},
}
@article{Anh05,
   author       = {{Ngoc Anh}, Vo and Moffat, Alistair},
   title        = {Inverted Index Compression Using Word-Aligned Binary
                   Codes},
   journal      = {Information Retrieval},
   volume       = {8},
   number       = {1},
   pages        = {151--166},
   year         = {2005},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {We examine index representation techniques for
                   document-based inverted files, and present a mechanism for
                   compressing them using word-aligned binary codes. The new
                   approach allows extremely fast decoding of inverted lists
                   during query processing, while providing compression rates
                   better than other high-throughput representations. Results
                   are given for several large text collections in support of
                   these claims, both for compression effectiveness and query
                   efficiency.},
}
% Placeholder entries for Volume 8, Issues 2, 3 and 4 (2005).
% Author, title, pages and abstract are still empty and must be filled in
% before these entries can be cited. The keys are kept distinct
% (05, 05a, 05b) because BibTeX reports identical keys as repeated entries.
@article{05,
   author       = {},
   title        = {},
   journal      = {Information Retrieval},
   volume       = {8},
   number       = {2},
   pages        = {},
   year         = {2005},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{05a,
   author       = {},
   title        = {},
   journal      = {Information Retrieval},
   volume       = {8},
   number       = {3},
   pages        = {},
   year         = {2005},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}
@article{05b,
   author       = {},
   title        = {},
   journal      = {Information Retrieval},
   volume       = {8},
   number       = {4},
   pages        = {},
   year         = {2005},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {},
}

