@article{Lin-2021-Pyserini:,
title = "Pyserini: A Python Toolkit for Reproducible Information Retrieval Research with Sparse and Dense Representations",
author = "Lin, Jimmy and
Ma, Xueguang and
Lin, Sheng-Chieh and
Yang, Jheng-Hong and
Pradeep, Ronak and
Nogueira, Rodrigo",
journal = "Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval",
year = "2021",
publisher = "ACM",
url = "https://gwf-uwaterloo.github.io/gwf-publications/G21-16001",
doi = "10.1145/3404835.3463238",
abstract = "Pyserini is a Python toolkit for reproducible information retrieval research with sparse and dense representations. It aims to provide effective, reproducible, and easy-to-use first-stage retrieval in a multi-stage ranking architecture. Our toolkit is self-contained as a standard Python package and comes with queries, relevance judgments, pre-built indexes, and evaluation scripts for many commonly used IR test collections. We aim to support, out of the box, the entire research lifecycle of efforts aimed at improving ranking with modern neural approaches. In particular, Pyserini supports sparse retrieval (e.g., BM25 scoring using bag-of-words representations), dense retrieval (e.g., nearest-neighbor search on transformer-encoded representations), as well as hybrid retrieval that integrates both approaches. This paper provides an overview of toolkit features and presents empirical results that illustrate its effectiveness on two popular ranking tasks. Around this toolkit, our group has built a culture of reproducibility through shared norms and tools that enable rigorous automated testing.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="Lin-2021-Pyserini:">
<titleInfo>
<title>Pyserini: A Python Toolkit for Reproducible Information Retrieval Research with Sparse and Dense Representations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jimmy</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xueguang</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sheng-Chieh</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jheng-Hong</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ronak</namePart>
<namePart type="family">Pradeep</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rodrigo</namePart>
<namePart type="family">Nogueira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>ACM</publisher>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Pyserini is a Python toolkit for reproducible information retrieval research with sparse and dense representations. It aims to provide effective, reproducible, and easy-to-use first-stage retrieval in a multi-stage ranking architecture. Our toolkit is self-contained as a standard Python package and comes with queries, relevance judgments, pre-built indexes, and evaluation scripts for many commonly used IR test collections. We aim to support, out of the box, the entire research lifecycle of efforts aimed at improving ranking with modern neural approaches. In particular, Pyserini supports sparse retrieval (e.g., BM25 scoring using bag-of-words representations), dense retrieval (e.g., nearest-neighbor search on transformer-encoded representations), as well as hybrid retrieval that integrates both approaches. This paper provides an overview of toolkit features and presents empirical results that illustrate its effectiveness on two popular ranking tasks. Around this toolkit, our group has built a culture of reproducibility through shared norms and tools that enable rigorous automated testing.</abstract>
<identifier type="citekey">Lin-2021-Pyserini:</identifier>
<identifier type="doi">10.1145/3404835.3463238</identifier>
<location>
<url>https://gwf-uwaterloo.github.io/gwf-publications/G21-16001</url>
</location>
<part>
<date>2021</date>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Pyserini: A Python Toolkit for Reproducible Information Retrieval Research with Sparse and Dense Representations
%A Lin, Jimmy
%A Ma, Xueguang
%A Lin, Sheng-Chieh
%A Yang, Jheng-Hong
%A Pradeep, Ronak
%A Nogueira, Rodrigo
%J Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval
%D 2021
%I ACM
%F Lin-2021-Pyserini:
%X Pyserini is a Python toolkit for reproducible information retrieval research with sparse and dense representations. It aims to provide effective, reproducible, and easy-to-use first-stage retrieval in a multi-stage ranking architecture. Our toolkit is self-contained as a standard Python package and comes with queries, relevance judgments, pre-built indexes, and evaluation scripts for many commonly used IR test collections. We aim to support, out of the box, the entire research lifecycle of efforts aimed at improving ranking with modern neural approaches. In particular, Pyserini supports sparse retrieval (e.g., BM25 scoring using bag-of-words representations), dense retrieval (e.g., nearest-neighbor search on transformer-encoded representations), as well as hybrid retrieval that integrates both approaches. This paper provides an overview of toolkit features and presents empirical results that illustrate its effectiveness on two popular ranking tasks. Around this toolkit, our group has built a culture of reproducibility through shared norms and tools that enable rigorous automated testing.
%R 10.1145/3404835.3463238
%U https://gwf-uwaterloo.github.io/gwf-publications/G21-16001
%U https://doi.org/10.1145/3404835.3463238
Markdown (Informal)
[Pyserini: A Python Toolkit for Reproducible Information Retrieval Research with Sparse and Dense Representations](https://gwf-uwaterloo.github.io/gwf-publications/G21-16001) (Lin et al., GWF 2021)
ACL
- Jimmy Lin, Xueguang Ma, Sheng-Chieh Lin, Jheng-Hong Yang, Ronak Pradeep, and Rodrigo Nogueira. 2021. Pyserini: A Python Toolkit for Reproducible Information Retrieval Research with Sparse and Dense Representations. Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval.