@comment{
  BSQA (BeeSpace question/answering) paper, Nucleic Acids Research 38 (2010).
  The long free-text blob that the export placed in `note` is stored under
  `annote` instead, so that standard bibliography styles do not print it in
  the reference list. Its wording is unchanged.
}
@article{d23416d3f7ce4bd1b1808c45acae859f,
  author    = {He, Xin and Li, Yanen and Khetani, Radhika and Sanders, Barry and Lu, Yue and Ling, Xu and Zhai, {Cheng Xiang} and Schatz, Bruce},
  title     = {{BSQA}: Integrated text mining using entity relation semantics extracted from biological literature of insects},
  journal   = {Nucleic Acids Research},
  year      = {2010},
  month     = jun,
  day       = {21},
  volume    = {38},
  number    = {Suppl. 2},
  pages     = {W175--W181},
  doi       = {10.1093/nar/gkq544},
  issn      = {0305-1048},
  publisher = {Oxford University Press},
  language  = {English (US)},
  abstract  = {Text mining is one promising way of extracting information automatically from the vast biological literature. To maximize its potential, the knowledge encoded in the text should be translated to some semantic representation such as entities and relations, which could be analyzed by machines. But large-scale practical systems for this purpose are rare. We present BeeSpace question/answering (BSQA) system that performs integrated text mining for insect biology, covering diverse aspects from molecular interactions of genes to insect behavior. BSQA recognizes a number of entities and relations in Medline documents about the model insect, Drosophila melanogaster. For any text query, BSQA exploits entity annotation of retrieved documents to identify important concepts in different categories. By utilizing the extracted relations, BSQA is also able to answer many biologically motivated questions, from simple ones such as, which anatomical part is a gene expressed in, to more complex ones involving multiple types of relations. BSQA is freely available at http://www.beespace.uiuc.edu/QuestionAnswer.},
  annote    = {Funding Information: Funding for open access charge: Frontiers of Integrative Biological Research program (grant 0425852) entitled BeeSpace: An Interactive Environment for Analyzing the Nature-Nurture in Societal Roles. Funding Information: BeeSpace is the flagship bioinformatics project in the National Science Foundation (NSF) Frontiers of Integrative Biological Research (FIBR) program, see www.beespace.uiuc.edu. The overall goal of BeeSpace is to develop new technologies for functional analysis of genes related to insect behavior, particularly focusing on the honey bee (12). In this work, we present a text mining system for insect biology, as part of BeeSpace. The core component of our BeeSpace question/answering (BSQA) system is the extraction of knowledge in the literature, in the form of various entities, such as genes and anatomical parts, and their inter-relationships. Built on top of this rich representation are two different ways of extracting information. First, for a text query, we automatically identify and rank the entities that appear in the retrieved documents. The ranked list, thus, serves as a compact summary of the documents. As one scenario, a user may query for a biological process, and the returned gene list would suggest genes likely involved in this process. Second, the various relations we recognize from literature are organized in a relational database, and we support a number of queries on this database. Thus a question from a user, such as, {\textquoteleft}in what anatomical part is a gene expressed{\textquoteright} can be formulated and executed as a structured query language (SQL) query. By utilizing both statistical patterns of entities (our first subsystem) and semantic relations (our second subsystem), we combine the strengths of IR and IE techniques to provide maximum flexibility of information access. Meanwhile, by integrating information on a number of entities and relations, our system enables a user to ask his or her questions from different perspectives.},
}