{"id":160235,"date":"2010-07-01T00:00:00","date_gmt":"2010-07-01T00:00:00","guid":{"rendered":"https:\/\/www.microsoft.com\/en-us\/research\/msr-research-item\/intelligent-selection-of-language-model-training-data\/"},"modified":"2018-10-16T20:13:06","modified_gmt":"2018-10-17T03:13:06","slug":"intelligent-selection-of-language-model-training-data","status":"publish","type":"msr-research-item","link":"https:\/\/www.microsoft.com\/en-us\/research\/publication\/intelligent-selection-of-language-model-training-data\/","title":{"rendered":"Intelligent Selection of Language Model Training Data"},"content":{"rendered":"<div class=\"asset-content\">\n<p>We address the problem of selecting nondomain-specific language model training data to build auxiliary language models for use in tasks such as machine translation. Our approach is based on comparing the entropy according to domain-specific and non-domain-specifc language models for sentences of the text source used to produce the latter language model. We show that this produces better language models, trained on less data, than either random data selection or a previous method based on measuring perplexity according to a domain-specific language model.<\/p>\n<\/div>\n<p><!-- .asset-content --><\/p>\n","protected":false},"excerpt":{"rendered":"<p>We address the problem of selecting nondomain-specific language model training data to build auxiliary language models for use in tasks such as machine translation. Our approach is based on comparing the entropy according to domain-specific and non-domain-specifc language models for sentences of the text source used to produce the latter language model. We show that [&hellip;]<\/p>\n","protected":false},"featured_media":0,"template":"","meta":{"msr-url-field":"","msr-podcast-episode":"","msrModifiedDate":"","msrModifiedDateEnabled":false,"ep_exclude_from_search":false,"_classifai_error":"","msr-author-ordering":null,"msr_publishername":"Association for Computational Linguistics","msr_publisher_other":"","msr_booktitle":"","msr_chapter":"","msr_edition":"Proceedings of the ACL 2010 Conference Short Papers","msr_editors":"","msr_how_published":"","msr_isbn":"","msr_issue":"","msr_journal":"","msr_number":"","msr_organization":"","msr_pages_string":"220\u2013224","msr_page_range_start":"220","msr_page_range_end":"224","msr_series":"","msr_volume":"","msr_copyright":"","msr_conference_name":"Proceedings of the ACL 2010 Conference Short Papers","msr_doi":"","msr_arxiv_id":"","msr_s2_paper_id":"","msr_mag_id":"","msr_pubmed_id":"","msr_other_authors":"Robert C. Moore, William Lewis","msr_other_contributors":"","msr_speaker":"","msr_award":"","msr_affiliation":"","msr_institution":"","msr_host":"","msr_version":"","msr_duration":"","msr_original_fields_of_study":"","msr_release_tracker_id":"","msr_s2_match_type":"","msr_citation_count_updated":"","msr_published_date":"2010-07-01","msr_highlight_text":"","msr_notes":"","msr_longbiography":"","msr_publicationurl":"http:\/\/www.aclweb.org\/anthology\/P10-2041","msr_external_url":"","msr_secondary_video_url":"","msr_conference_url":"","msr_journal_url":"","msr_s2_pdf_url":"","msr_year":2010,"msr_citation_count":0,"msr_influential_citations":0,"msr_reference_count":0,"msr_s2_match_confidence":0,"msr_microsoftintellectualproperty":true,"msr_s2_open_access":false,"msr_s2_author_ids":[],"msr_pub_ids":[],"msr_hide_image_in_river":0,"footnotes":""},"msr-research-highlight":[],"research-area":[13560],"msr-publication-type":[193716],"msr-publisher":[],"msr-focus-area":[],"msr-locale":[268875],"msr-post-option":[],"msr-field-of-study":[],"msr-conference":[],"msr-journal":[],"msr-impact-theme":[],"msr-pillar":[],"class_list":["post-160235","msr-research-item","type-msr-research-item","status-publish","hentry","msr-research-area-programming-languages-software-engineering","msr-locale-en_us"],"msr_publishername":"Association for Computational Linguistics","msr_edition":"Proceedings of the ACL 2010 Conference Short Papers","msr_affiliation":"","msr_published_date":"2010-07-01","msr_host":"","msr_duration":"","msr_version":"","msr_speaker":"","msr_other_contributors":"","msr_booktitle":"","msr_pages_string":"220\u2013224","msr_chapter":"","msr_isbn":"","msr_journal":"","msr_volume":"","msr_number":"","msr_editors":"","msr_series":"","msr_issue":"","msr_organization":"","msr_how_published":"","msr_notes":"","msr_highlight_text":"","msr_release_tracker_id":"","msr_original_fields_of_study":"","msr_download_urls":"","msr_external_url":"","msr_secondary_video_url":"","msr_longbiography":"","msr_microsoftintellectualproperty":1,"msr_main_download":"457290","msr_publicationurl":"http:\/\/www.aclweb.org\/anthology\/P10-2041","msr_doi":"","msr_publication_uploader":[{"type":"file","title":"ntelligent Selection of Language Model Training Data","viewUrl":"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2010\/07\/P10-2041.pdf","id":457290,"label_id":0},{"type":"url","title":"http:\/\/www.aclweb.org\/anthology\/P10-2041","viewUrl":false,"id":false,"label_id":0}],"msr_related_uploader":"","msr_citation_count":0,"msr_citation_count_updated":"","msr_s2_paper_id":"","msr_influential_citations":0,"msr_reference_count":0,"msr_arxiv_id":"","msr_s2_author_ids":[],"msr_s2_open_access":false,"msr_s2_pdf_url":null,"msr_attachments":[{"id":0,"url":"http:\/\/www.aclweb.org\/anthology\/P10-2041"}],"msr-author-ordering":[{"type":"text","value":"Robert C. Moore","user_id":0,"rest_url":false},{"type":"user_nicename","value":"wilewis","user_id":34835,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=wilewis"}],"msr_impact_theme":[],"msr_research_lab":[],"msr_event":[],"msr_group":[],"msr_project":[],"publication":[],"video":[],"msr-tool":[],"msr_publication_type":"inproceedings","related_content":[],"_links":{"self":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/160235","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item"}],"about":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/types\/msr-research-item"}],"version-history":[{"count":1,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/160235\/revisions"}],"predecessor-version":[{"id":524511,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/160235\/revisions\/524511"}],"wp:attachment":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/media?parent=160235"}],"wp:term":[{"taxonomy":"msr-research-highlight","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-highlight?post=160235"},{"taxonomy":"msr-research-area","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/research-area?post=160235"},{"taxonomy":"msr-publication-type","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-publication-type?post=160235"},{"taxonomy":"msr-publisher","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-publisher?post=160235"},{"taxonomy":"msr-focus-area","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-focus-area?post=160235"},{"taxonomy":"msr-locale","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-locale?post=160235"},{"taxonomy":"msr-post-option","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-post-option?post=160235"},{"taxonomy":"msr-field-of-study","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-field-of-study?post=160235"},{"taxonomy":"msr-conference","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-conference?post=160235"},{"taxonomy":"msr-journal","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-journal?post=160235"},{"taxonomy":"msr-impact-theme","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-impact-theme?post=160235"},{"taxonomy":"msr-pillar","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-pillar?post=160235"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}