-
Notifications
You must be signed in to change notification settings - Fork 2
/
tinydb_for_sampling.json
1 lines (1 loc) · 41.4 KB
/
tinydb_for_sampling.json
1
{"_default": {"1": {"Abstract": "We present BLESS, a comprehensive performance benchmark of the most recent state-of-the-art large language models (LLMs) on the task of text simplification (TS). We examine how well off-the-shelf LLMs can solve this challenging task, assessing a total of 44 models, differing in size, architecture, pre-training methods, and accessibility, on three test sets from different domains (Wikipedia, news, and medical) under a few-shot setting. Our analysis considers a suite of automatic metrics as well as a large-scale quantitative investigation into the types of common edit operations performed by the different models. Furthermore, we perform a manual qualitative analysis on a subset of model outputs to better gauge the quality of the generated simplifications. Our evaluation indicates that the best LLMs, despite not being trained on TS, perform comparably with state-of-the-art TS baselines. Additionally, we find that certain LLMs demonstrate a greater range and diversity of edit operations. Our performance benchmark will be available as a resource for the development of future TS methods and evaluation metrics.", "ArxivID": "2310.15773v1", "Authors": [{"Affiliations": null, "ForeName": null, "FullName": "Tannon Kew", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Alison Chi", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Laura V\u00e1squez-Rodr\u00edguez", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Sweta Agrawal", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Dennis Aumiller", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Fernando Alva-Manchego", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Matthew Shardlow", "HashID": null, "LastName": null, "ORCID": null}], "CiteCrawlerDeep": 1, "CitedBy": null, "DOI": null, "FlagAffiliationMining": 0, "FlagEmbedding": 0, "FlagExtractKG": 0, "FlagExtractTopic": 0, "InsertType": null, "Journal": "Arxiv", "Keywords": null, "NamedEntities": null, "OreginalArticle": {"arxiv:comment": {"#text": "This paper has been accepted to EMNLP 2023 as a main long paper. 9\n pages, 7 figures", "@xmlns:arxiv": "http:https://arxiv.org/schemas/atom"}, "arxiv:primary_category": {"@scheme": "http:https://arxiv.org/schemas/atom", "@term": "cs.CL", "@xmlns:arxiv": "http:https://arxiv.org/schemas/atom"}, "author": [{"name": "Tannon Kew"}, {"name": "Alison Chi"}, {"name": "Laura V\u00e1squez-Rodr\u00edguez"}, {"name": "Sweta Agrawal"}, {"name": "Dennis Aumiller"}, {"name": "Fernando Alva-Manchego"}, {"name": "Matthew Shardlow"}], "category": {"@scheme": "http:https://arxiv.org/schemas/atom", "@term": "cs.CL"}, "id": "http:https://arxiv.org/abs/2310.15773v1", "link": [{"@href": "http:https://arxiv.org/abs/2310.15773v1", "@rel": "alternate", "@type": "text/html"}, {"@href": "http:https://arxiv.org/pdf/2310.15773v1", "@rel": "related", "@title": "pdf", "@type": "application/pdf"}], "published": "2023-10-24T12:18:17Z", "summary": "We present BLESS, a comprehensive performance benchmark of the most recent\nstate-of-the-art large language models (LLMs) on the task of text\nsimplification (TS). We examine how well off-the-shelf LLMs can solve this\nchallenging task, assessing a total of 44 models, differing in size,\narchitecture, pre-training methods, and accessibility, on three test sets from\ndifferent domains (Wikipedia, news, and medical) under a few-shot setting. Our\nanalysis considers a suite of automatic metrics as well as a large-scale\nquantitative investigation into the types of common edit operations performed\nby the different models. Furthermore, we perform a manual qualitative analysis\non a subset of model outputs to better gauge the quality of the generated\nsimplifications. Our evaluation indicates that the best LLMs, despite not being\ntrained on TS, perform comparably with state-of-the-art TS baselines.\nAdditionally, we find that certain LLMs demonstrate a greater range and\ndiversity of edit operations. Our performance benchmark will be available as a\nresource for the development of future TS methods and evaluation metrics.", "title": "BLESS: Benchmarking Large Language Models on Sentence Simplification", "updated": "2023-10-24T12:18:17Z"}, "PMC": null, "PMID": null, "Published": "2023-10-24T12:18:17+00:00", "QueryTranslation": "ArXiv Query: search_query=ti:\"large language model\" AND ti:Benchmark AND abs:medical&id_list=&start=0&max_results=10", "ReferenceCrawlerDeep": 1, "References": null, "SourceBank": 2, "State": 5, "Title": "BLESS: Benchmarking Large Language Models on Sentence Simplification", "Topics": null, "FullTextMetadata": {"ConverterType": "unstructured", "TokenCount": 10332}}, "2": {"Abstract": "METHODS: First, a set of evaluation criteria is designed based on a comprehensive literature review. Second, existing candidate criteria are optimized for using a Delphi method by five experts in medicine and engineering. Third, three clinical experts design a set of medical datasets to interact with LLMs. Finally, benchmarking experiments are conducted on the datasets. The responses generated by chatbots based on LLMs are recorded for blind evaluations by five licensed medical experts. RESULTS: The obtained evaluation criteria cover medical professional capabilities, social comprehensive capabilities, contextual capabilities, and computational robustness, with sixteen detailed indicators. The medical datasets include twenty-seven medical dialogues and seven case reports in Chinese. Three chatbots are evaluated, ChatGPT by OpenAI, ERNIE Bot by Baidu Inc., and Doctor PuJiang (Dr. PJ) by Shanghai Artificial Intelligence Laboratory. Experimental results show that Dr. PJ outperforms ChatGPT and ERNIE Bot in both multiple-turn medical dialogue and case report scenarios.", "ArxivID": "2305.07340v1", "Authors": [{"Affiliations": null, "ForeName": null, "FullName": "Jie Xu", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Lu Lu", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Sen Yang", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Bilin Liang", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Xinwei Peng", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Jiali Pang", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Jinru Ding", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Xiaoming Shi", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Lingrui Yang", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Huan Song", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Kang Li", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Xin Sun", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Shaoting Zhang", "HashID": null, "LastName": null, "ORCID": null}], "CiteCrawlerDeep": 1, "CitedBy": null, "DOI": null, "FlagAffiliationMining": 0, "FlagEmbedding": 0, "FlagExtractKG": 0, "FlagExtractTopic": 0, "InsertType": null, "Journal": "Arxiv", "Keywords": null, "NamedEntities": null, "OreginalArticle": {"arxiv:primary_category": {"@scheme": "http:https://arxiv.org/schemas/atom", "@term": "cs.CL", "@xmlns:arxiv": "http:https://arxiv.org/schemas/atom"}, "author": [{"name": "Jie Xu"}, {"name": "Lu Lu"}, {"name": "Sen Yang"}, {"name": "Bilin Liang"}, {"name": "Xinwei Peng"}, {"name": "Jiali Pang"}, {"name": "Jinru Ding"}, {"name": "Xiaoming Shi"}, {"name": "Lingrui Yang"}, {"name": "Huan Song"}, {"name": "Kang Li"}, {"name": "Xin Sun"}, {"name": "Shaoting Zhang"}], "category": {"@scheme": "http:https://arxiv.org/schemas/atom", "@term": "cs.CL"}, "id": "http:https://arxiv.org/abs/2305.07340v1", "link": [{"@href": "http:https://arxiv.org/abs/2305.07340v1", "@rel": "alternate", "@type": "text/html"}, {"@href": "http:https://arxiv.org/pdf/2305.07340v1", "@rel": "related", "@title": "pdf", "@type": "application/pdf"}], "published": "2023-05-12T09:37:13Z", "summary": "METHODS: First, a set of evaluation criteria is designed based on a\ncomprehensive literature review. Second, existing candidate criteria are\noptimized for using a Delphi method by five experts in medicine and\nengineering. Third, three clinical experts design a set of medical datasets to\ninteract with LLMs. Finally, benchmarking experiments are conducted on the\ndatasets. The responses generated by chatbots based on LLMs are recorded for\nblind evaluations by five licensed medical experts. RESULTS: The obtained\nevaluation criteria cover medical professional capabilities, social\ncomprehensive capabilities, contextual capabilities, and computational\nrobustness, with sixteen detailed indicators. The medical datasets include\ntwenty-seven medical dialogues and seven case reports in Chinese. Three\nchatbots are evaluated, ChatGPT by OpenAI, ERNIE Bot by Baidu Inc., and Doctor\nPuJiang (Dr. PJ) by Shanghai Artificial Intelligence Laboratory. Experimental\nresults show that Dr. PJ outperforms ChatGPT and ERNIE Bot in both\nmultiple-turn medical dialogue and case report scenarios.", "title": "MedGPTEval: A Dataset and Benchmark to Evaluate Responses of Large\n Language Models in Medicine", "updated": "2023-05-12T09:37:13Z"}, "PMC": null, "PMID": null, "Published": "2023-05-12T09:37:13+00:00", "QueryTranslation": "ArXiv Query: search_query=ti:\"large language model\" AND ti:Benchmark AND abs:medical&id_list=&start=0&max_results=10", "ReferenceCrawlerDeep": 1, "References": null, "SourceBank": 2, "State": 5, "Title": "MedGPTEval: A Dataset and Benchmark to Evaluate Responses of Large Language Models in Medicine", "Topics": null, "FullTextMetadata": {"ConverterType": "unstructured", "TokenCount": 5514}}, "3": {"Abstract": "The emergence of various medical large language models (LLMs) in the medical domain has highlighted the need for unified evaluation standards, as manual evaluation of LLMs proves to be time-consuming and labor-intensive. To address this issue, we introduce MedBench, a comprehensive benchmark for the Chinese medical domain, comprising 40,041 questions sourced from authentic examination exercises and medical reports of diverse branches of medicine. In particular, this benchmark is composed of four key components: the Chinese Medical Licensing Examination, the Resident Standardization Training Examination, the Doctor In-Charge Qualification Examination, and real-world clinic cases encompassing examinations, diagnoses, and treatments. MedBench replicates the educational progression and clinical practice experiences of doctors in Mainland China, thereby establishing itself as a credible benchmark for assessing the mastery of knowledge and reasoning abilities in medical language learning models. We perform extensive experiments and conduct an in-depth analysis from diverse perspectives, which culminate in the following findings: (1) Chinese medical LLMs underperform on this benchmark, highlighting the need for significant advances in clinical knowledge and diagnostic precision. (2) Several general-domain LLMs surprisingly possess considerable medical knowledge. These findings elucidate both the capabilities and limitations of LLMs within the context of MedBench, with the ultimate goal of aiding the medical research community.", "ArxivID": "2312.12806v1", "Authors": [{"Affiliations": null, "ForeName": null, "FullName": "Yan Cai", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Linlin Wang", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Ye Wang", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Gerard de Melo", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Ya Zhang", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Yanfeng Wang", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Liang He", "HashID": null, "LastName": null, "ORCID": null}], "CiteCrawlerDeep": 1, "CitedBy": null, "DOI": null, "FlagAffiliationMining": 0, "FlagEmbedding": 0, "FlagExtractKG": 0, "FlagExtractTopic": 0, "InsertType": null, "Journal": "Arxiv", "Keywords": null, "NamedEntities": null, "OreginalArticle": {"arxiv:comment": {"#text": "accepted by AAAI-24", "@xmlns:arxiv": "http:https://arxiv.org/schemas/atom"}, "arxiv:primary_category": {"@scheme": "http:https://arxiv.org/schemas/atom", "@term": "cs.CL", "@xmlns:arxiv": "http:https://arxiv.org/schemas/atom"}, "author": [{"name": "Yan Cai"}, {"name": "Linlin Wang"}, {"name": "Ye Wang"}, {"name": "Gerard de Melo"}, {"name": "Ya Zhang"}, {"name": "Yanfeng Wang"}, {"name": "Liang He"}], "category": [{"@scheme": "http:https://arxiv.org/schemas/atom", "@term": "cs.CL"}, {"@scheme": "http:https://arxiv.org/schemas/atom", "@term": "cs.AI"}], "id": "http:https://arxiv.org/abs/2312.12806v1", "link": [{"@href": "http:https://arxiv.org/abs/2312.12806v1", "@rel": "alternate", "@type": "text/html"}, {"@href": "http:https://arxiv.org/pdf/2312.12806v1", "@rel": "related", "@title": "pdf", "@type": "application/pdf"}], "published": "2023-12-20T07:01:49Z", "summary": "The emergence of various medical large language models (LLMs) in the medical\ndomain has highlighted the need for unified evaluation standards, as manual\nevaluation of LLMs proves to be time-consuming and labor-intensive. To address\nthis issue, we introduce MedBench, a comprehensive benchmark for the Chinese\nmedical domain, comprising 40,041 questions sourced from authentic examination\nexercises and medical reports of diverse branches of medicine. In particular,\nthis benchmark is composed of four key components: the Chinese Medical\nLicensing Examination, the Resident Standardization Training Examination, the\nDoctor In-Charge Qualification Examination, and real-world clinic cases\nencompassing examinations, diagnoses, and treatments. MedBench replicates the\neducational progression and clinical practice experiences of doctors in\nMainland China, thereby establishing itself as a credible benchmark for\nassessing the mastery of knowledge and reasoning abilities in medical language\nlearning models. We perform extensive experiments and conduct an in-depth\nanalysis from diverse perspectives, which culminate in the following findings:\n(1) Chinese medical LLMs underperform on this benchmark, highlighting the need\nfor significant advances in clinical knowledge and diagnostic precision. (2)\nSeveral general-domain LLMs surprisingly possess considerable medical\nknowledge. These findings elucidate both the capabilities and limitations of\nLLMs within the context of MedBench, with the ultimate goal of aiding the\nmedical research community.", "title": "MedBench: A Large-Scale Chinese Benchmark for Evaluating Medical Large\n Language Models", "updated": "2023-12-20T07:01:49Z"}, "PMC": null, "PMID": null, "Published": "2023-12-20T07:01:49+00:00", "QueryTranslation": "ArXiv Query: search_query=ti:\"large language model\" AND ti:Benchmark AND abs:medical&id_list=&start=0&max_results=10", "ReferenceCrawlerDeep": 1, "References": null, "SourceBank": 2, "State": 5, "Title": "MedBench: A Large-Scale Chinese Benchmark for Evaluating Medical Large Language Models", "Topics": null, "FullTextMetadata": {"ConverterType": "unstructured", "TokenCount": 5211}}, "4": {"Abstract": "Recent advancements in large language models (LLMs) have transformed the field of question answering (QA). However, evaluating LLMs in the medical field is challenging due to the lack of standardized and comprehensive datasets. To address this gap, we introduce CMExam, sourced from the Chinese National Medical Licensing Examination. CMExam consists of 60K+ multiple-choice questions for standardized and objective evaluations, as well as solution explanations for model reasoning evaluation in an open-ended manner. For in-depth analyses of LLMs, we invited medical professionals to label five additional question-wise annotations, including disease groups, clinical departments, medical disciplines, areas of competency, and question difficulty levels. Alongside the dataset, we further conducted thorough experiments with representative LLMs and QA algorithms on CMExam. The results show that GPT-4 had the best accuracy of 61.6% and a weighted F1 score of 0.617. These results highlight a great disparity when compared to human accuracy, which stood at 71.6%. For explanation tasks, while LLMs could generate relevant reasoning and demonstrate improved performance after finetuning, they fall short of a desired standard, indicating ample room for improvement. To the best of our knowledge, CMExam is the first Chinese medical exam dataset to provide comprehensive medical annotations. The experiments and findings of LLM evaluation also provide valuable insights into the challenges and potential solutions in developing Chinese medical QA systems and LLM evaluation pipelines. The dataset and relevant code are available at https://github.com/williamliujl/CMExam.", "ArxivID": "2306.03030v3", "Authors": [{"Affiliations": null, "ForeName": null, "FullName": "Junling Liu", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Peilin Zhou", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Yining Hua", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Dading Chong", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Zhongyu Tian", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Andrew Liu", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Helin Wang", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Chenyu You", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Zhenhua Guo", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Lei Zhu", "HashID": null, "LastName": null, "ORCID": null}, {"Affiliations": null, "ForeName": null, "FullName": "Michael Lingzhi Li", "HashID": null, "LastName": null, "ORCID": null}], "CiteCrawlerDeep": 1, "CitedBy": null, "DOI": null, "FlagAffiliationMining": 0, "FlagEmbedding": 0, "FlagExtractKG": 0, "FlagExtractTopic": 0, "InsertType": null, "Journal": "Arxiv", "Keywords": null, "NamedEntities": null, "OreginalArticle": {"arxiv:comment": {"#text": "Accepted by NeurIPS 2023 Datasets and Benchmarks Track", "@xmlns:arxiv": "http:https://arxiv.org/schemas/atom"}, "arxiv:primary_category": {"@scheme": "http:https://arxiv.org/schemas/atom", "@term": "cs.CL", "@xmlns:arxiv": "http:https://arxiv.org/schemas/atom"}, "author": [{"name": "Junling Liu"}, {"name": "Peilin Zhou"}, {"name": "Yining Hua"}, {"name": "Dading Chong"}, {"name": "Zhongyu Tian"}, {"name": "Andrew Liu"}, {"name": "Helin Wang"}, {"name": "Chenyu You"}, {"name": "Zhenhua Guo"}, {"name": "Lei Zhu"}, {"name": "Michael Lingzhi Li"}], "category": {"@scheme": "http:https://arxiv.org/schemas/atom", "@term": "cs.CL"}, "id": "http:https://arxiv.org/abs/2306.03030v3", "link": [{"@href": "http:https://arxiv.org/abs/2306.03030v3", "@rel": "alternate", "@type": "text/html"}, {"@href": "http:https://arxiv.org/pdf/2306.03030v3", "@rel": "related", "@title": "pdf", "@type": "application/pdf"}], "published": "2023-06-05T16:48:41Z", "summary": "Recent advancements in large language models (LLMs) have transformed the\nfield of question answering (QA). However, evaluating LLMs in the medical field\nis challenging due to the lack of standardized and comprehensive datasets. To\naddress this gap, we introduce CMExam, sourced from the Chinese National\nMedical Licensing Examination. CMExam consists of 60K+ multiple-choice\nquestions for standardized and objective evaluations, as well as solution\nexplanations for model reasoning evaluation in an open-ended manner. For\nin-depth analyses of LLMs, we invited medical professionals to label five\nadditional question-wise annotations, including disease groups, clinical\ndepartments, medical disciplines, areas of competency, and question difficulty\nlevels. Alongside the dataset, we further conducted thorough experiments with\nrepresentative LLMs and QA algorithms on CMExam. The results show that GPT-4\nhad the best accuracy of 61.6% and a weighted F1 score of 0.617. These results\nhighlight a great disparity when compared to human accuracy, which stood at\n71.6%. For explanation tasks, while LLMs could generate relevant reasoning and\ndemonstrate improved performance after finetuning, they fall short of a desired\nstandard, indicating ample room for improvement. To the best of our knowledge,\nCMExam is the first Chinese medical exam dataset to provide comprehensive\nmedical annotations. The experiments and findings of LLM evaluation also\nprovide valuable insights into the challenges and potential solutions in\ndeveloping Chinese medical QA systems and LLM evaluation pipelines. The dataset\nand relevant code are available at https://github.com/williamliujl/CMExam.", "title": "Benchmarking Large Language Models on CMExam -- A Comprehensive Chinese\n Medical Exam Dataset", "updated": "2023-10-23T02:55:08Z"}, "PMC": null, "PMID": null, "Published": "2023-06-05T16:48:41+00:00", "QueryTranslation": "ArXiv Query: search_query=ti:\"large language model\" AND ti:Benchmark AND abs:medical&id_list=&start=0&max_results=10", "ReferenceCrawlerDeep": 1, "References": null, "SourceBank": 2, "State": 5, "Title": "Benchmarking Large Language Models on CMExam -- A Comprehensive Chinese Medical Exam Dataset", "Topics": null, "FullTextMetadata": {"ConverterType": "unstructured", "TokenCount": 9801}}, "5": {"Abstract": "The quickly-expanding nature of published medical literature makes it challenging for clinicians and researchers to keep up with and summarize recent, relevant findings in a timely manner. While several closed-source summarization tools based on large language models (LLMs) now exist, rigorous and systematic evaluations of their outputs are lacking. Furthermore, there is a paucity of high-quality datasets and appropriate benchmark tasks with which to evaluate these tools. We address these issues with four contributions: we release Clinfo.ai, an open-source WebApp that answers clinical questions based on dynamically retrieved scientific literature; we specify an information retrieval and abstractive summarization task to evaluate the performance of such retrieval-augmented LLM systems; we release a dataset of 200 questions and corresponding answers derived from published systematic reviews, which we name PubMed Retrieval and Synthesis (PubMedRS-200); and report benchmark results for Clinfo.ai and other publicly available OpenQA systems on PubMedRS-200.", "ArxivID": null, "Authors": [{"Affiliations": [{"Has_Extra": false, "HashID": "-1775506661586935709", "ParseMethod": null, "Part1": "Department of Biomedical Data Science", "Part2": "Stanford University", "Part3": "Stanford", "Part4": "CA", "Part5": "USA*Equal Contribution", "Part6": "[email protected].", "Structural": null, "Text": "Department of Biomedical Data Science, Stanford University, Stanford, CA, USA*Equal Contribution, [email protected]."}], "ForeName": "Alejandro", "FullName": "Alejandro Lozano", "HashID": "8771235452525963704", "LastName": "Lozano", "ORCID": null}, {"Affiliations": null, "ForeName": "Scott L", "FullName": "Scott L Fleming", "HashID": "-2681292226312938746", "LastName": "Fleming", "ORCID": null}, {"Affiliations": null, "ForeName": "Chia-Chun", "FullName": "Chia-Chun Chiang", "HashID": "7123069067498440076", "LastName": "Chiang", "ORCID": null}, {"Affiliations": null, "ForeName": "Nigam", "FullName": "Nigam Shah", "HashID": "-4982490440114452432", "LastName": "Shah", "ORCID": null}], "CiteCrawlerDeep": 1, "CitedBy": null, "DOI": null, "FlagAffiliationMining": 0, "FlagEmbedding": 0, "FlagExtractKG": 0, "FlagExtractTopic": 0, "InsertType": null, "Journal": "Pacific Symposium on Biocomputing. Pacific Symposium on Biocomputing", "Keywords": [{"IS_Major": false, "IS_Mesh": true, "Text": "Humans"}, {"IS_Major": true, "IS_Mesh": true, "Text": "Natural Language Processing"}, {"IS_Major": true, "IS_Mesh": true, "Text": "Computational Biology"}, {"IS_Major": false, "IS_Mesh": true, "Text": "PubMed"}, {"IS_Major": false, "IS_Mesh": true, "Text": "Information Storage and Retrieval"}, {"IS_Major": false, "IS_Mesh": true, "Text": "Language"}], "NamedEntities": null, "OreginalArticle": {"PubmedArticleSet": {"PubmedArticle": {"MedlineCitation": {"@IndexingMethod": "Automated", "@Owner": "NLM", "@Status": "MEDLINE", "Article": {"@PubModel": "Print", "Abstract": {"AbstractText": "The quickly-expanding nature of published medical literature makes it challenging for clinicians and researchers to keep up with and summarize recent, relevant findings in a timely manner. While several closed-source summarization tools based on large language models (LLMs) now exist, rigorous and systematic evaluations of their outputs are lacking. Furthermore, there is a paucity of high-quality datasets and appropriate benchmark tasks with which to evaluate these tools. We address these issues with four contributions: we release Clinfo.ai, an open-source WebApp that answers clinical questions based on dynamically retrieved scientific literature; we specify an information retrieval and abstractive summarization task to evaluate the performance of such retrieval-augmented LLM systems; we release a dataset of 200 questions and corresponding answers derived from published systematic reviews, which we name PubMed Retrieval and Synthesis (PubMedRS-200); and report benchmark results for Clinfo.ai and other publicly available OpenQA systems on PubMedRS-200."}, "ArticleTitle": "Clinfo.ai: An Open-Source Retrieval-Augmented Large Language Model System for Answering Medical Questions using Scientific Literature.", "AuthorList": {"@CompleteYN": "Y", "Author": [{"@ValidYN": "Y", "AffiliationInfo": {"Affiliation": "Department of Biomedical Data Science, Stanford University, Stanford, CA, USA*Equal Contribution, [email protected]."}, "ForeName": "Alejandro", "Initials": "A", "LastName": "Lozano"}, {"@ValidYN": "Y", "ForeName": "Scott L", "Initials": "SL", "LastName": "Fleming"}, {"@ValidYN": "Y", "ForeName": "Chia-Chun", "Initials": "CC", "LastName": "Chiang"}, {"@ValidYN": "Y", "ForeName": "Nigam", "Initials": "N", "LastName": "Shah"}]}, "Journal": {"ISOAbbreviation": "Pac Symp Biocomput", "ISSN": {"#text": "2335-6936", "@IssnType": "Electronic"}, "JournalIssue": {"@CitedMedium": "Internet", "PubDate": {"Year": "2024"}, "Volume": "29"}, "Title": "Pacific Symposium on Biocomputing. Pacific Symposium on Biocomputing"}, "Language": "eng", "Pagination": {"EndPage": "23", "MedlinePgn": "8-23", "StartPage": "8"}, "PublicationTypeList": {"PublicationType": {"#text": "Journal Article", "@UI": "D016428"}}}, "CitationSubset": "IM", "DateCompleted": {"Day": "03", "Month": "01", "Year": "2024"}, "DateRevised": {"Day": "06", "Month": "01", "Year": "2024"}, "MedlineJournalInfo": {"Country": "United States", "ISSNLinking": "2335-6928", "MedlineTA": "Pac Symp Biocomput", "NlmUniqueID": "9711271"}, "MeshHeadingList": {"MeshHeading": [{"DescriptorName": {"#text": "Humans", "@MajorTopicYN": "N", "@UI": "D006801"}}, {"DescriptorName": {"#text": "Natural Language Processing", "@MajorTopicYN": "Y", "@UI": "D009323"}}, {"DescriptorName": {"#text": "Computational Biology", "@MajorTopicYN": "Y", "@UI": "D019295"}}, {"DescriptorName": {"#text": "PubMed", "@MajorTopicYN": "N", "@UI": "D039781"}}, {"DescriptorName": {"#text": "Information Storage and Retrieval", "@MajorTopicYN": "N", "@UI": "D016247"}}, {"DescriptorName": {"#text": "Language", "@MajorTopicYN": "N", "@UI": "D007802"}}]}, "PMID": {"#text": "38160266", "@Version": "1"}}, "PubmedData": {"ArticleIdList": {"ArticleId": [{"#text": "38160266", "@IdType": "pubmed"}, {"#text": "9789811286421_0002", "@IdType": "pii"}]}, "History": {"PubMedPubDate": [{"@PubStatus": "medline", "Day": "3", "Hour": "9", "Minute": "43", "Month": "1", "Year": "2024"}, {"@PubStatus": "pubmed", "Day": "2", "Hour": "11", "Minute": "46", "Month": "1", "Year": "2024"}, {"@PubStatus": "entrez", "Day": "31", "Hour": "4", "Minute": "3", "Month": "12", "Year": "2023"}]}, "PublicationStatus": "ppublish"}}}}, "PMC": null, "PMID": "38160266", "Published": null, "QueryTranslation": "\"large language model\"[Title] AND \"Benchmark\"[Title/Abstract]", "ReferenceCrawlerDeep": 1, "References": null, "SourceBank": 1, "State": 3, "Title": "Clinfo.ai: An Open-Source Retrieval-Augmented Large Language Model System for Answering Medical Questions using Scientific Literature.", "Topics": null}, "6": {"Abstract": " To analyze the quality and readability of information regarding shoulder stabilization surgery available using an online AI software (ChatGPT), using standardized scoring systems, as well as to report on the given answers by the AI. An open AI model (ChatGPT) was used to answer 23 commonly asked questions from patients on shoulder stabilization surgery. These answers were evaluated for medical accuracy, quality, and readability using The JAMA Benchmark criteria, DISCERN score, Flesch-Kincaid Reading Ease Score (FRES) & Grade Level (FKGL). The JAMA Benchmark criteria score was 0, which is the lowest score, indicating no reliable resources cited. The DISCERN score was 60, which is considered a good score. The areas that open AI model did not achieve full marks were also related to the lack of available source material used to compile the answers, and finally some shortcomings with information not fully supported by the literature. The FRES was 26.2, and the FKGL was considered to be that of a college graduate. There was generally high quality in the answers given on questions relating to shoulder stabilization surgery, but there was a high reading level required to comprehend the information presented. However, it is unclear where the answers came from with no source material cited. It is important to note that the ChatGPT software repeatedly references the need to discuss these questions with an orthopaedic surgeon and the importance of shared discussion making, as well as compliance with surgeon treatment recommendations. As shoulder instability is an injury that predominantly affects younger individuals who may use the Internet for information, this study shows what information patients may be getting online.", "ArxivID": null, "Authors": [{"Affiliations": [{"Has_Extra": false, "HashID": "833694043597546057", "ParseMethod": null, "Part1": "Duke University", "Part2": "Durham", "Part3": "North Carolina", "Part4": "U.S.A.. Electronic address: [email protected].", "Part5": null, "Part6": null, "Structural": null, "Text": "Duke University, Durham, North Carolina, U.S.A.. Electronic address: [email protected]."}], "ForeName": "Eoghan T", "FullName": "Eoghan T Hurley", "HashID": "-6257082949604618993", "LastName": "Hurley", "ORCID": null}, {"Affiliations": [{"Has_Extra": false, "HashID": "-3970346220490531474", "ParseMethod": null, "Part1": "Duke University", "Part2": "Durham", "Part3": "North Carolina", "Part4": "U.S.A.", "Part5": null, "Part6": null, "Structural": null, "Text": "Duke University, Durham, North Carolina, U.S.A."}], "ForeName": "Bryan S", "FullName": "Bryan S Crook", "HashID": "7372784893760995755", "LastName": "Crook", "ORCID": null}, {"Affiliations": [{"Has_Extra": false, "HashID": "-3970346220490531474", "ParseMethod": null, "Part1": "Duke University", "Part2": "Durham", "Part3": "North Carolina", "Part4": "U.S.A.", "Part5": null, "Part6": null, "Structural": null, "Text": "Duke University, Durham, North Carolina, U.S.A."}], "ForeName": "Samuel G", "FullName": "Samuel G Lorentz", "HashID": "6314927455442220639", "LastName": "Lorentz", "ORCID": null}, {"Affiliations": [{"Has_Extra": false, "HashID": "-3970346220490531474", "ParseMethod": null, "Part1": "Duke University", "Part2": "Durham", "Part3": "North Carolina", "Part4": "U.S.A.", "Part5": null, "Part6": null, "Structural": null, "Text": "Duke University, Durham, North Carolina, U.S.A."}], "ForeName": "Richard M", "FullName": "Richard M Danilkowicz", "HashID": "2504652072344007582", "LastName": "Danilkowicz", "ORCID": null}, {"Affiliations": [{"Has_Extra": false, "HashID": "-3970346220490531474", "ParseMethod": null, "Part1": "Duke University", "Part2": "Durham", "Part3": "North Carolina", "Part4": "U.S.A.", "Part5": null, "Part6": null, "Structural": null, "Text": "Duke University, Durham, North Carolina, U.S.A."}], "ForeName": "Brian C", "FullName": "Brian C Lau", "HashID": "-7353087780429404689", "LastName": "Lau", "ORCID": null}, {"Affiliations": [{"Has_Extra": false, "HashID": "-3970346220490531474", "ParseMethod": null, "Part1": "Duke University", "Part2": "Durham", "Part3": "North Carolina", "Part4": "U.S.A.", "Part5": null, "Part6": null, "Structural": null, "Text": "Duke University, Durham, North Carolina, U.S.A."}], "ForeName": "Dean C", "FullName": "Dean C Taylor", "HashID": "8708203189260101418", "LastName": "Taylor", "ORCID": null}, {"Affiliations": [{"Has_Extra": false, "HashID": "-3970346220490531474", "ParseMethod": null, "Part1": "Duke University", "Part2": "Durham", "Part3": "North Carolina", "Part4": "U.S.A.", "Part5": null, "Part6": null, "Structural": null, "Text": "Duke University, Durham, North Carolina, U.S.A."}], "ForeName": "Jonathan F", "FullName": "Jonathan F Dickens", "HashID": "7560353828154771362", "LastName": "Dickens", "ORCID": null}, {"Affiliations": [{"Has_Extra": false, "HashID": "-3970346220490531474", "ParseMethod": null, "Part1": "Duke University", "Part2": "Durham", "Part3": "North Carolina", "Part4": "U.S.A.", "Part5": null, "Part6": null, "Structural": null, "Text": "Duke University, Durham, North Carolina, U.S.A."}], "ForeName": "Oke", "FullName": "Oke Anakwenze", "HashID": "7960893475344936042", "LastName": "Anakwenze", "ORCID": null}, {"Affiliations": [{"Has_Extra": false, "HashID": "-3970346220490531474", "ParseMethod": null, "Part1": "Duke University", "Part2": "Durham", "Part3": "North Carolina", "Part4": "U.S.A.", "Part5": null, "Part6": null, "Structural": null, "Text": "Duke University, Durham, North Carolina, U.S.A."}], "ForeName": "Christopher S", "FullName": "Christopher S Klifto", "HashID": "-8033363191431007531", "LastName": "Klifto", "ORCID": null}], "CiteCrawlerDeep": 1, "CitedBy": ["38248805", "38024047"], "DOI": "10.1016/j.arthro.2023.07.048", "FlagAffiliationMining": 0, "FlagEmbedding": 0, "FlagExtractKG": 0, "FlagExtractTopic": 0, "InsertType": null, "Journal": "Arthroscopy : the journal of arthroscopic & related surgery : official publication of the Arthroscopy Association of North America and the International Arthroscopy Association", "Keywords": [], "NamedEntities": null, "OreginalArticle": {"PubmedArticleSet": {"PubmedArticle": {"MedlineCitation": {"@IndexingMethod": "Automated", "@Owner": "NLM", "@Status": "Publisher", "Article": {"@PubModel": "Print-Electronic", "Abstract": {"AbstractText": [{"#text": "To analyze the quality and readability of information regarding shoulder stabilization surgery available using an online AI software (ChatGPT), using standardized scoring systems, as well as to report on the given answers by the AI.", "@Label": "PURPOSE", "@NlmCategory": "OBJECTIVE"}, {"#text": "An open AI model (ChatGPT) was used to answer 23 commonly asked questions from patients on shoulder stabilization surgery. These answers were evaluated for medical accuracy, quality, and readability using The JAMA Benchmark criteria, DISCERN score, Flesch-Kincaid Reading Ease Score (FRES) & Grade Level (FKGL).", "@Label": "METHODS", "@NlmCategory": "METHODS"}, {"#text": "The JAMA Benchmark criteria score was 0, which is the lowest score, indicating no reliable resources cited. The DISCERN score was 60, which is considered a good score. The areas that open AI model did not achieve full marks were also related to the lack of available source material used to compile the answers, and finally some shortcomings with information not fully supported by the literature. The FRES was 26.2, and the FKGL was considered to be that of a college graduate.", "@Label": "RESULTS", "@NlmCategory": "RESULTS"}, {"#text": "There was generally high quality in the answers given on questions relating to shoulder stabilization surgery, but there was a high reading level required to comprehend the information presented. However, it is unclear where the answers came from with no source material cited. It is important to note that the ChatGPT software repeatedly references the need to discuss these questions with an orthopaedic surgeon and the importance of shared discussion making, as well as compliance with surgeon treatment recommendations.", "@Label": "CONCLUSION", "@NlmCategory": "CONCLUSIONS"}, {"#text": "As shoulder instability is an injury that predominantly affects younger individuals who may use the Internet for information, this study shows what information patients may be getting online.", "@Label": "CLINICAL RELEVANCE", "@NlmCategory": "CONCLUSIONS"}], "CopyrightInformation": "Copyright \u00a9 2023 Arthroscopy Association of North America. Published by Elsevier Inc. All rights reserved."}, "ArticleDate": {"@DateType": "Electronic", "Day": "09", "Month": "08", "Year": "2023"}, "ArticleTitle": "Evaluation High-Quality of Information from ChatGPT (Artificial Intelligence-Large Language Model) Artificial Intelligence on Shoulder Stabilization Surgery.", "AuthorList": {"@CompleteYN": "Y", "Author": [{"@ValidYN": "Y", "AffiliationInfo": {"Affiliation": "Duke University, Durham, North Carolina, U.S.A.. Electronic address: [email protected]."}, "ForeName": "Eoghan T", "Initials": "ET", "LastName": "Hurley"}, {"@ValidYN": "Y", "AffiliationInfo": {"Affiliation": "Duke University, Durham, North Carolina, U.S.A."}, "ForeName": "Bryan S", "Initials": "BS", "LastName": "Crook"}, {"@ValidYN": "Y", "AffiliationInfo": {"Affiliation": "Duke University, Durham, North Carolina, U.S.A."}, "ForeName": "Samuel G", "Initials": "SG", "LastName": "Lorentz"}, {"@ValidYN": "Y", "AffiliationInfo": {"Affiliation": "Duke University, Durham, North Carolina, U.S.A."}, "ForeName": "Richard M", "Initials": "RM", "LastName": "Danilkowicz"}, {"@ValidYN": "Y", "AffiliationInfo": {"Affiliation": "Duke University, Durham, North Carolina, U.S.A."}, "ForeName": "Brian C", "Initials": "BC", "LastName": "Lau"}, {"@ValidYN": "Y", "AffiliationInfo": {"Affiliation": "Duke University, Durham, North Carolina, U.S.A."}, "ForeName": "Dean C", "Initials": "DC", "LastName": "Taylor"}, {"@ValidYN": "Y", "AffiliationInfo": {"Affiliation": "Duke University, Durham, North Carolina, U.S.A."}, "ForeName": "Jonathan F", "Initials": "JF", "LastName": "Dickens"}, {"@ValidYN": "Y", "AffiliationInfo": {"Affiliation": "Duke University, Durham, North Carolina, U.S.A."}, "ForeName": "Oke", "Initials": "O", "LastName": "Anakwenze"}, {"@ValidYN": "Y", "AffiliationInfo": {"Affiliation": "Duke University, Durham, North Carolina, U.S.A."}, "ForeName": "Christopher S", "Initials": "CS", "LastName": "Klifto"}]}, "ELocationID": [{"#text": "S0749-8063(23)00642-4", "@EIdType": "pii", "@ValidYN": "Y"}, {"#text": "10.1016/j.arthro.2023.07.048", "@EIdType": "doi", "@ValidYN": "Y"}], "Journal": {"ISOAbbreviation": "Arthroscopy", "ISSN": {"#text": "1526-3231", "@IssnType": "Electronic"}, "JournalIssue": {"@CitedMedium": "Internet", "PubDate": {"Day": "09", "Month": "Aug", "Year": "2023"}}, "Title": "Arthroscopy : the journal of arthroscopic & related surgery : official publication of the Arthroscopy Association of North America and the International Arthroscopy Association"}, "Language": "eng", "PublicationTypeList": {"PublicationType": {"#text": "Journal Article", "@UI": "D016428"}}}, "CitationSubset": "IM", "DateRevised": {"Day": "23", "Month": "08", "Year": "2023"}, "MedlineJournalInfo": {"Country": "United States", "ISSNLinking": "0749-8063", "MedlineTA": "Arthroscopy", "NlmUniqueID": "8506498"}, "PMID": {"#text": "37567487", "@Version": "1"}}, "PubmedData": {"ArticleIdList": {"ArticleId": [{"#text": "37567487", "@IdType": "pubmed"}, {"#text": "10.1016/j.arthro.2023.07.048", "@IdType": "doi"}, {"#text": "S0749-8063(23)00642-4", "@IdType": "pii"}]}, "History": {"PubMedPubDate": [{"@PubStatus": "received", "Day": "29", "Month": "3", "Year": "2023"}, {"@PubStatus": "revised", "Day": "27", "Month": "6", "Year": "2023"}, {"@PubStatus": "accepted", "Day": "28", "Month": "7", "Year": "2023"}, {"@PubStatus": "pubmed", "Day": "12", "Hour": "10", "Minute": "42", "Month": "8", "Year": "2023"}, {"@PubStatus": "medline", "Day": "12", "Hour": "10", "Minute": "42", "Month": "8", "Year": "2023"}, {"@PubStatus": "entrez", "Day": "11", "Hour": "19", "Minute": "27", "Month": "8", "Year": "2023"}]}, "PublicationStatus": "aheadofprint"}}}}, "PMC": null, "PMID": "37567487", "Published": null, "QueryTranslation": "\"large language model\"[Title] AND \"Benchmark\"[Title/Abstract]", "ReferenceCrawlerDeep": 1, "References": null, "SourceBank": 1, "State": 3, "Title": "Evaluation High-Quality of Information from ChatGPT (Artificial Intelligence-Large Language Model) Artificial Intelligence on Shoulder Stabilization Surgery.", "Topics": null}}}