% Journal article record: Nute, Chou, Molloy and Warnow (2018), BMC Genomics 19.
% The acknowledgement/funding statement and the publisher copyright line are kept
% in the non-standard `funding` and `copyright` fields rather than in `note`, so
% that they stay in the database without being printed in the bibliography.
@article{9f7673b6479d47d2b518470b0e867d38,
  author    = {Nute, Michael and Chou, Jed and Molloy, Erin K. and Warnow, Tandy},
  title     = {The performance of coalescent-based species tree estimation methods under models of missing data},
  journal   = {BMC Genomics},
  volume    = {19},
  year      = {2018},
  month     = may,
  day       = {8},
  publisher = {BioMed Central},
  issn      = {1471-2164},
  doi       = {10.1186/s12864-018-4619-8},
  language  = {English (US)},
  keywords  = {ASTRAL, ASTRID, Incomplete lineage sorting, MP-EST, Missing data, Multi-species coalescent, SVDquartets, Species tree},
  abstract  = {Background: Estimation of species trees from multiple genes is complicated by processes such as incomplete lineage sorting, gene duplication and loss, and horizontal gene transfer, that result in gene trees that differ from each other and from the species phylogeny. Methods to estimate species trees in the presence of gene tree discord due to incomplete lineage sorting have been developed and proved to be statistically consistent when gene tree discord is due only to incomplete lineage sorting and every gene tree includes the full set of species. Results: We establish statistical consistency of certain coalescent-based species tree estimation methods under some models of taxon deletion from genes. We also evaluate the impact of missing data on four species tree estimation methods (ASTRAL-II, ASTRID, MP-EST, and SVDquartets) using simulated datasets with varying levels of incomplete lineage sorting, gene tree estimation error, and degrees/patterns of missing data. Conclusions: All the species tree estimation methods improved in accuracy as the number of genes increased and often produced highly accurate species trees even when the amount of missing data was large. These results together indicate that accurate species tree estimation is possible under a variety of conditions, even when there are substantial amounts of missing data.},
  funding   = {Acknowledgements: We thank the reviewers of our RECOMB-CG paper for their suggestions, which led to improvements in the quality of this paper. Funding: This work was supported by the National Science Foundation (grant numbers CCF:1535977 to TW). MN was supported by a fellowship from the CompGen initiative in the Coordinated Science Laboratory at University of Illinois at Urbana-Champaign. JC was supported by the Department of Mathematics at University of Illinois at Urbana-Champaign. EKM was supported by the National Science Foundation (grant number DGE-1144245 to EKM). This research was part of the Blue Waters sustained-petascale computing project, which is supported by the National Science Foundation (grant numbers OCI-0725070 and ACI-1238993) and the state of Illinois. Blue Waters is a joint effort of the University of Illinois at Urbana-Champaign and its National Center for Supercomputing Applications. This work made use of the Illinois Campus Cluster, a computing resource that is operated by the Illinois Campus Cluster Program in conjunction with the National Center for Supercomputing Applications and which is supported by funds from the University of Illinois at Urbana-Champaign. The publication cost of this article was funded by NSF grant CCF-1535977.},
  copyright = {{\textcopyright} 2018 The Author(s).},
}