@inproceedings{clarke-etal-2023-label, title = {Label Agnostic Pre-training for Zero-shot Text Classification}, author = {Christopher Clarke and Yuzhao Heng and Yiping Kang and Krisztian Flautner and Lingjia Tang and Jason Mars}, url = {https://aclanthology.org/2023.findings-acl.64}, doi = {10.18653/v1/2023.findings-acl.64}, year = {2023}, date = {2023-07-01}, booktitle = {Findings of the Association for Computational Linguistics: ACL 2023}, pages = {1009–1021}, publisher = {Association for Computational Linguistics}, address = {Toronto, Canada}, abstract = {Conventional approaches to text classification typically assume the existence of a fixed set of predefined labels to which a given text can be classified. However, in real-world applications, there exists an infinite label space for describing a given text. In addition, depending on the aspect (sentiment, topic, etc.) and domain of the text (finance, legal, etc.), the interpretation of the label can vary greatly. This makes the task of text classification, particularly in the zero-shot scenario, extremely challenging. In this paper, we investigate the task of zero-shot text classification with the aim of improving the ability of pre-trained language models (PLMs) to generalize to both seen and unseen data across varying aspects and domains. To solve this we introduce two new simple yet effective pre-training strategies, Implicit and Explicit pre-training. These methods inject aspect-level understanding into the model at train time with the goal of conditioning the model to build task-level understanding. To evaluate this, we construct and release UTCD, a new benchmark dataset for evaluating text classification in zero-shot settings. 
Experimental results on UTCD show that our approach achieves improved zero-shot generalization on a suite of challenging datasets across an array of zero-shot formalizations.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{clarke-etal-2023-rule, title = {Rule By Example: Harnessing Logical Rules for Explainable Hate Speech Detection}, author = {Christopher Clarke and Matthew Hall and Gaurav Mittal and Ye Yu and Sandra Sajeev and Jason Mars and Mei Chen}, url = {https://aclanthology.org/2023.acl-long.22}, doi = {10.18653/v1/2023.acl-long.22}, year = {2023}, date = {2023-07-01}, booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, pages = {364–376}, publisher = {Association for Computational Linguistics}, address = {Toronto, Canada}, abstract = {Classic approaches to content moderation typically apply a rule-based heuristic approach to flag content. While rules are easily customizable and intuitive for humans to interpret, they are inherently fragile and lack the flexibility or robustness needed to moderate the vast amount of undesirable content found online today. Recent advances in deep learning have demonstrated the promise of using highly effective deep neural models to overcome these challenges. However, despite the improved performance, these data-driven models lack transparency and explainability, often leading to mistrust from everyday users and a lack of adoption by many platforms. In this paper, we present Rule By Example (RBE): a novel exemplar-based contrastive learning approach for learning from logical rules for the task of textual content moderation. RBE is capable of providing rule-grounded predictions, allowing for more explainable and customizable predictions compared to typical deep learning-based approaches. We demonstrate that our approach is capable of learning rich rule embedding representations using only a few data examples. 
Experimental results on 3 popular hate speech classification datasets show that RBE is able to outperform state-of-the-art deep learning classifiers as well as the use of rules in both supervised and unsupervised settings while providing explainable model predictions via rule-grounding.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @article{10129141, title = {The Jaseci Programming Paradigm and Runtime Stack: Building Scale-Out Production Applications Easy and Fast}, author = {Jason Mars and Yiping Kang and Roland Daynauth and Baichuan Li and Ashish Mahendra and Krisztian Flautner and Lingjia Tang}, doi = {10.1109/LCA.2023.3274038}, year = {2023}, date = {2023-01-01}, urldate = {2023-01-01}, journal = {IEEE Computer Architecture Letters}, volume = {22}, number = {2}, pages = {101-104}, abstract = {Today's production scale-out applications include many sub-application components, such as storage backends, logging infrastructure and AI models. These components have drastically different characteristics, are required to work in collaboration, and interface with each other as microservices. This leads to increasingly high complexity in developing, optimizing, configuring, and deploying scale-out applications, raising the barrier to entry for most individuals and small teams. We developed a novel co-designed runtime system, Jaseci , and programming language, Jac , which aims to reduce this complexity. The key design principle throughout Jaseci's design is to raise the level of abstraction by moving as much of the scale-out data management, microservice componentization, and live update complexity into the runtime stack to be automated and optimized automatically. 
We use real-world AI applications to demonstrate Jaseci's benefit for application performance and developer productivity.}, keywords = {Pub}, pubstate = {published}, tppubtype = {article} } @inproceedings{clarke-etal-2022-one, title = {One Agent To Rule Them All: Towards Multi-agent Conversational AI}, author = {Christopher Clarke and Joseph Peper and Karthik Krishnamurthy and Walter Talamonti and Kevin Leach and Walter Lasecki and Yiping Kang and Lingjia Tang and Jason Mars}, url = {https://aclanthology.org/2022.findings-acl.257}, doi = {10.18653/v1/2022.findings-acl.257}, year = {2022}, date = {2022-05-01}, urldate = {2022-05-01}, booktitle = {Findings of the Association for Computational Linguistics: ACL 2022}, pages = {3258--3267}, publisher = {Association for Computational Linguistics}, address = {Dublin, Ireland}, abstract = {The increasing volume of commercially available conversational agents (CAs) on the market has resulted in users being burdened with learning and adopting multiple agents to accomplish their tasks. Though prior work has explored supporting a multitude of domains within the design of a single agent, the interaction experience suffers due to the large action space of desired capabilities. To address these problems, we introduce a new task BBAI: Black-Box Agent Integration, focusing on combining the capabilities of multiple black-box CAs at scale. We explore two techniques: question agent pairing and question response pairing aimed at resolving this task. Leveraging these techniques, we design One For All (OFA), a scalable system that provides a unified interface to interact with multiple CAs. Additionally, we introduce MARS: Multi-Agent Response Selection, a new encoder model for question response pairing that jointly encodes user question and agent response pairs. We demonstrate that OFA is able to automatically and accurately integrate an ensemble of commercially available CAs spanning disparate domains. 
Specifically, using the MARS encoder we achieve the highest accuracy on our BBAI task, outperforming strong baselines.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @misc{larson2020systems, title = {Systems and methods for automatically configuring training data for training machine learning models of a machine learning-based dialogue system including seeding training samples or curating a corpus of training data based on instances of training data identified as anomalous}, author = {Stefan Larson and Anish Mahendran and Andrew Lee and Jonathan K Kummerfeld and Parker Hill and Michael A Laurenzano and Johann Hauswald and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/12/US9117447.pdf}, year = {2020}, date = {2020-06-01}, abstract = {A system and method for improving a machine learning-based dialogue system includes: sourcing a corpus of raw machine learning training data from sources of training data based on a plurality of seed training samples, wherein the corpus of raw machine learning training data comprises a plurality of distinct instances of training data; generating a vector representation for each distinct instance of training data; identifying statistical characteristics of the corpus of raw machine learning training data based on a mapping of the vector representation for each distinct instance of training data; identifying anomalous instances of the plurality of distinct instances of training data of the corpus of raw machine learning training data based on the identified statistical characteristics of the corpus; and curating the corpus of raw machine learning training data based on each of the instances of training data identified as anomalous instances.}, note = {US Patent 10,679,150}, keywords = {Patent}, pubstate = {published}, tppubtype = {misc} } @misc{mars2020system, title = {System and method for implementing an artificially intelligent virtual assistant using machine 
learning}, author = {Jason Mars and Lingjia Tang and Michael Laurenzano and Johann Hauswald and Parker Hill}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/US20190130244A1.pdf}, year = {2020}, date = {2020-02-01}, abstract = {Systems and methods for implementing an artificially intelligent virtual assistant includes collecting a user query; using a competency classification machine learning model to generate a competency label for the user query; using a slot identification machine learning model to segment the text of the query and label each of the slots of the query; generating a slot value for each of the slots of the query; generating a handler for each of the slot values; and using the slot values to: identify an external data source relevant to the user query, fetch user data from the external data source, and apply one or more operations to the query to generate response data; and using the response data, to generate a response to the user query.}, note = {US Patent 10,572,801}, keywords = {Patent}, pubstate = {published}, tppubtype = {misc} } @misc{kang2020systems, title = {Systems and methods for intelligently curating machine learning training data and improving machine learning model performance}, author = {Yiping Kang and Yunqi Zhang and Jonathan K Kummerfeld and Parker Hill and Johann Hauswald and Michael A Laurenzano and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/12/US10679100.pdf}, year = {2020}, date = {2020-01-01}, abstract = {Systems and methods of intelligent formation and acquisition of machine learning training data for implementing an artificially intelligent dialogue system includes constructing a corpora of machine learning test corpus that comprise a plurality of historical queries and commands sampled from production logs of a deployed dialogue system; configuring training data sourcing parameters to source a corpora of raw machine learning training data from remote sources of 
machine learning training data; calculating efficacy metrics of the corpora of raw machine learning training data, wherein calculating the efficacy metrics includes calculating one or more of a coverage metric value and a diversity metric value of the corpora of raw machine learning training data; using the corpora of raw machine learning training data to train the at least one machine learning classifier if the calculated coverage metric value of the corpora of machine learning training data satisfies a minimum coverage metric threshold.}, note = {US Patent 10,679,100}, keywords = {Patent}, pubstate = {published}, tppubtype = {misc} } @inproceedings{liu2020benchmarking, title = {A Benchmarking Framework for Interactive 3D Applications in the Cloud}, author = {Tianyi Liu and Sen He and Sunzhou Huang and Danny Tsang and Lingjia Tang and Jason Mars and Wei Wang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/12/2006.13378.pdf}, year = {2020}, date = {2020-01-01}, urldate = {2020-01-01}, booktitle = {2020 53rd Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)}, pages = {881--894}, organization = {IEEE}, abstract = {With the growing popularity of cloud gaming and cloud virtual reality (VR), interactive 3D applications have become a major class of workloads for the cloud. However, despite their growing importance, there is limited public research on how to design cloud systems to efficiently support these applications due to the lack of an open and reliable research infrastructure, including benchmarks and performance analysis tools. The challenges of generating human-like inputs under various system/application nondeterminism and dissecting the performance of complex graphics systems make it very difficult to design such an infrastructure. In this paper, we present the design of a novel research infrastructure, Pictor, for cloud 3D applications and systems. Pictor employs AI to mimic human interactions with complex 3D applications. 
It can also track the processing of user inputs to provide in-depth performance measurements for the complex software and hardware stack used for cloud 3D-graphics rendering. With Pictor, we designed a benchmark suite with six interactive 3D applications. Performance analyses were conducted with these benchmarks, which show that cloud system designs, including both system software and hardware designs, are crucial to the performance of cloud 3D applications. The analyses also show that energy consumption can be reduced by at least 37% when two 3D applications share a cloud server. To demonstrate the effectiveness of Pictor, we also implemented two optimizations to address two performance bottlenecks discovered in a state-of-the-art cloud 3D-graphics rendering system. These two optimizations improved the frame rate by 57.7% on average.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @misc{mars2020systems, title = {Systems and methods for intelligently configuring and deploying a machine learning-based dialogue system}, author = {Jason Mars and Lingjia Tang and Michael A Laurenzano and Johann Hauswald and Parker Hill and Yiping Kang and Yunqi Zhang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/12/US10769384.pdf}, year = {2020}, date = {2020-01-01}, abstract = {A system and method for intelligently configuring a machine learning-based dialogue system includes a conversational deficiency assessment of a target dialog system, wherein implementing the conversational deficiency assessment includes: (i) identifying distinct corpora of mishandled utterances based on an assessment of the distinct corpora of dialogue data; (ii) identifying candidate corpus of mishandled utterances from the distinct corpora of mishandled utterances as suitable candidates for building new dialogue competencies for the target dialogue system if candidate metrics of the candidate corpus of mishandled utterances satisfy a candidate threshold; building the new 
dialogue competencies for the target dialogue system for each of the candidate corpus of mishandled utterances having candidate metrics that satisfy the candidate threshold; and configuring a dialogue system control structure for the target dialogue system based on the new dialogue competencies, wherein the dialogue system control structure governs an operation of an automated dialogue agent.}, note = {US Patent 10,769,384}, keywords = {Patent}, pubstate = {published}, tppubtype = {misc} } @misc{peper2020systems, title = {Systems and methods for machine learning-based multi-intent segmentation and classification}, author = {Joseph Peper and Parker Hill and Kevin Leach and Sean Stapleton and Jonathan K Kummerfeld and Johann Hauswald and Michael Laurenzano and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/12/US10824818.pdf}, year = {2020}, date = {2020-01-01}, abstract = {Systems and methods for synthesizing training data for multi-intent utterance segmentation include identifying a first corpus of utterances comprising a plurality of distinct single-intent in-domain utterances; identifying a second corpus of utterances comprising a plurality of distinct single-intent out-of-domain utterances; identifying a third corpus comprising a plurality of distinct conjunction terms; forming a multi-intent training corpus comprising synthetic multi-intent utterances, wherein forming each distinct multi-intent utterance includes: selecting a first distinct in-domain utterance from the first corpus of utterances; probabilistically selecting one of a first out-of-domain utterance from the second corpus and a second in-domain utterance from the first corpus; probabilistically selecting or not selecting a distinct conjunction term from the third corpus; and forming a synthetic multi-intent utterance including appending the first in-domain utterance with one of the first out-of-domain utterance from the second corpus of utterances and the 
second in-domain utterance from the first corpus of utterances.}, note = {US Patent 10,824,818}, keywords = {Patent}, pubstate = {published}, tppubtype = {misc} } @misc{lee2020systems, title = {Systems and methods for constructing an artificially diverse corpus of training data samples for training a contextually-biased model for a machine learning-based dialogue system}, author = {Andrew Lee and Stefan Larson and Christopher Clarke and Kevin Leach and Jonathan K Kummerfeld and Parker Hill and Johann Hauswald and Michael A Laurenzano and Lingjia Tang and Jason Mars and others}, url = {https://www.jasonmars.org/wp-content/uploads/2020/12/US10796104.pdf}, year = {2020}, date = {2020-01-01}, abstract = {Systems and methods for constructing an artificially diverse corpus of training data includes evaluating a corpus of utterance-based training data samples, identifying a slot replacement candidate; deriving distinct skeleton utterances that include the slot replacement candidate, wherein deriving the distinct skeleton utterances includes replacing slots of each of the plurality of distinct utterance training samples with one of a special token and proper slot classification labels; selecting a subset of the distinct skeleton utterances; converting each of the distinct skeleton utterances of the subset back to distinct utterance training samples while still maintaining the special token at a position of the slot replacement candidate; altering a percentage of the distinct utterance training samples with a distinct randomly-generated slot token value at the position of the slot replacement candidate; and constructing the artificially diverse corpus of training samples based on a collection of the percentage of the distinct utterance training samples.}, note = {US Patent 10,796,104}, keywords = {Patent}, pubstate = {published}, tppubtype = {misc} } @article{Liu2020ABF, title = {A Benchmarking Framework for Interactive 3D Applications in the Cloud}, author = {Tianyi Liu 
and Sen He and Sunzhou Huang and Danny Tsang and Lingjia Tang and Jason Mars and Wei Wang}, year = {2020}, date = {2020-01-01}, journal = {2020 53rd Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)}, pages = {881--894}, keywords = {}, pubstate = {published}, tppubtype = {article} } @misc{kang2019systems, title = {Systems and methods for intelligently curating machine learning training data and improving machine learning model performance}, author = {Yiping Kang and Yunqi Zhang and Jonathan K Kummerfeld and Parker Hill and Johann Hauswald and Michael A Laurenzano and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/US20190294925A1.pdf}, year = {2019}, date = {2019-05-01}, abstract = {Systems and methods of intelligent formation and acquisition of machine learning training data for implementing an artificially intelligent dialogue system includes constructing a corpora of machine learning test corpus that comprise a plurality of historical queries and commands sampled from production logs of a deployed dialogue system; configuring training data sourcing parameters to source a corpora of raw machine learning training data from remote sources of machine learning training data; calculating efficacy metrics of the corpora of raw machine learning training data, wherein calculating the efficacy metrics includes calculating one or more of a coverage metric value and a diversity metric value of the corpora of raw machine learning training data; using the corpora of raw machine learning training data to train the at least one machine learning classifier if the calculated coverage metric value of the corpora of machine learning training data satisfies a minimum coverage metric threshold.}, note = {US Patent 10,303,978}, keywords = {Patent}, pubstate = {published}, tppubtype = {misc} } @inproceedings{kannan2019grandslam, title = {{GrandSLAm}: Guaranteeing {SLAs} for Jobs in Microservices Execution Frameworks}, author = 
{Ram Srivatsa Kannan and Lavanya Subramanian and Ashwin Raju and Jeongseob Ahn and Jason Mars and Lingjia Tang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3302424.3303958.pdf}, year = {2019}, date = {2019-01-01}, booktitle = {Proceedings of the Fourteenth EuroSys Conference 2019}, pages = {1--16}, abstract = {The microservice architecture has dramatically reduced user effort in adopting and maintaining servers by providing a catalog of functions as services that can be used as building blocks to construct applications. This has enabled datacenter operators to look at managing datacenter hosting microservices quite differently from traditional infrastructures. Such a paradigm shift calls for a need to rethink resource management strategies employed in such execution environments. We observe that the visibility enabled by a microservices execution framework can be exploited to achieve high throughput and resource utilization while still meeting Service Level Agreements, especially in multi-tenant execution scenarios. In this study, we present GrandSLAm, a microservice execution framework that improves utilization of datacenters hosting microservices. GrandSLAm estimates time of completion of requests propagating through individual microservice stages within an application. It then leverages this estimate to drive a runtime system that dynamically batches and reorders requests at each microservice in a manner where individual jobs meet their respective target latency while achieving high throughput. 
GrandSLAm significantly increases throughput by up to 3x compared to our baseline, without violating SLAs for a wide range of real-world AI and ML applications.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{arora2019understanding, title = {Understanding the Impact of Socket Density in Density Optimized Servers}, author = {Manish Arora and Matt Skach and Wei Huang and Xudong An and Jason Mars and Lingjia Tang and Dean M Tullsen}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/08675196.pdf}, year = {2019}, date = {2019-01-01}, booktitle = {2019 IEEE International Symposium on High Performance Computer Architecture (HPCA)}, pages = {687--700}, organization = {IEEE}, abstract = {The increasing demand for computational power has led to the creation and deployment of large-scale data centers. During the last few years, data centers have seen improvements aimed at increasing computational density - the amount of throughput that can be achieved within the allocated physical footprint. This need to pack more compute in the same physical space has led to density optimized server designs. Density optimized servers push compute density significantly beyond what can be achieved by blade servers by using innovative modular chassis based designs. This paper presents a comprehensive analysis of the impact of socket density on intra-server thermals and demonstrates that increased socket density inside the server leads to large temperature variations among sockets due to inter-socket thermal coupling. The paper shows that traditional chip-level and data center-level temperature-aware scheduling techniques do not work well for thermally-coupled sockets. The paper proposes new scheduling techniques that account for the thermals of the socket a task is scheduled on, as well as thermally coupled nearby sockets. 
The proposed mechanisms provide 2.5% to 6.5% performance improvements across various workloads and as much as 17% over traditional temperature-aware schedulers for computation-heavy workloads.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @article{larson2019outlier, title = {Outlier Detection for Improved Data Quality and Diversity in Dialog Systems}, author = {Stefan Larson and Anish Mahendran and Andrew Lee and Jonathan K Kummerfeld and Parker Hill and Michael A Laurenzano and Johann Hauswald and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/N19-1051.pdf}, year = {2019}, date = {2019-01-01}, journal = {Proceedings of NAACL-HLT 2019}, pages = {517–527}, abstract = {In a corpus of data, outliers are either errors: mistakes in the data that are counterproductive, or are unique: informative samples that improve model robustness. Identifying outliers can lead to better datasets by (1) removing noise in datasets and (2) guiding collection of additional data to fill gaps. However, the problem of detecting both outlier types has received relatively little attention in NLP, particularly for dialog systems. We introduce a simple and effective technique for detecting both erroneous and unique samples in a corpus of short texts using neural sentence embeddings combined with distance-based outlier detection. We also present a novel data collection pipeline built atop our detection technique to automatically and iteratively mine unique data samples while discarding erroneous samples. 
Experiments show that our outlier detection technique is effective at finding errors while our data collection pipeline yields highly diverse corpora that in turn produce more robust intent classification and slot-filling models.}, keywords = {Pub}, pubstate = {published}, tppubtype = {article} } @misc{tang2019system, title = {System and methods for sharing memory subsystem resources among datacenter applications}, author = {Lingjia Tang and Jason Mars and Robert Hundt}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/US9401869.pdf}, year = {2019}, date = {2019-01-01}, abstract = {Systems and methods for mapping applications onto system resource of a computing platform are discussed. The computing platform may receive, using control circuitry, a request to run a plurality of applications on a computing platform having a plurality of system resources. The computing platform may determine a plurality of mapping configurations for the plurality of applications onto the plurality of system resources. The computing platform may execute the plurality of applications with each of the plurality of mapping configurations. The computing platform may determine at least one performance metric based on the executed plurality of applications for each of the plurality of mapping configurations. 
The computing platform may select a selected mapping configuration among the plurality of mapping configurations based on at least one determined performance metric.}, note = {US Patent 10,313,265}, keywords = {Patent}, pubstate = {published}, tppubtype = {misc} } @article{kannan2019caliper, title = {Caliper: Interference estimator for multi-tenant environments sharing architectural resources}, author = {Ram Srivatsa Kannan and Michael Laurenzano and Jeongseob Ahn and Jason Mars and Lingjia Tang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3323090.pdf}, year = {2019}, date = {2019-01-01}, journal = {ACM Transactions on Architecture and Code Optimization (TACO)}, volume = {16}, number = {3}, pages = {1--25}, publisher = {ACM New York, NY, USA}, abstract = {We introduce Caliper, a technique for accurately estimating performance interference occurring in shared servers. Caliper overcomes the limitations of prior approaches by leveraging a micro-experiment-based technique. In contrast to state-of-the-art approaches that focus on periodically pausing co-running applications to estimate slowdown, Caliper utilizes a strategic phase-triggered technique to capture interference due to co-location. This enables Caliper to orchestrate an accurate and low-overhead interference estimation technique that can be readily deployed in existing production systems. 
We evaluate Caliper for a broad spectrum of workload scenarios, demonstrating its ability to seamlessly support up to 16 applications running simultaneously and outperform the state-of-the-art approaches.}, keywords = {Pub}, pubstate = {published}, tppubtype = {article} } @article{larson2019evaluation, title = {An Evaluation Dataset for Intent Classification and Out-of-Scope Prediction}, author = {Stefan Larson and Anish Mahendran and Joseph J Peper and Christopher Clarke and Andrew Lee and Parker Hill and Jonathan K Kummerfeld and Kevin Leach and Michael A Laurenzano and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/D19-1131.pdf}, year = {2019}, date = {2019-01-01}, journal = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing}, pages = {1311–1316}, abstract = {Task-oriented dialog systems need to know when a query falls outside their range of supported intents, but current text classification corpora only define label sets that cover every example. We introduce a new dataset that includes queries that are out-of-scope---i.e., queries that do not fall into any of the system's supported intents. This poses a new challenge because models cannot assume that every query at inference time belongs to a system-supported intent class. Our dataset also covers 150 intent classes over 10 domains, capturing the breadth that a production task-oriented agent must handle. We evaluate a range of benchmark classifiers on our dataset along with several different out-of-scope identification schemes. We find that while the classifiers perform well on in-scope intent classification, they struggle to identify out-of-scope queries. 
Our dataset and evaluation fill an important gap in the field, offering a way of more rigorously and realistically benchmarking text classification in task-driven dialog systems.}, keywords = {Pub}, pubstate = {published}, tppubtype = {article} } @article{hill2018rethinking, title = {Rethinking numerical representations for deep neural networks}, author = {Parker Hill and Babak Zamirai and Shengshuo Lu and Yu-Wei Chao and Michael Laurenzano and Mehrzad Samadi and Marios Papaefthymiou and Scott Mahlke and Thomas Wenisch and Jia Deng and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/1808.02513.pdf}, year = {2018}, date = {2018-01-01}, journal = {arXiv preprint arXiv:1808.02513}, abstract = {With ever-increasing computational demand for deep learning, it is critical to investigate the implications of the numeric representation and precision of DNN model weights and activations on computational efficiency. In this work, we explore unconventional narrow-precision floating-point representations as it relates to inference accuracy and efficiency to steer the improved design of future DNN platforms. We show that inference using these custom numeric representations on production-grade DNNs, including GoogLeNet and VGG, achieves an average speedup of 7.6x with less than 1% degradation in inference accuracy relative to a state-of-the-art baseline platform representing the most sophisticated hardware using single-precision floating point. To facilitate the use of such customized precision, we also present a novel technique that drastically reduces the time required to derive the optimal precision configuration. 
}, keywords = {Pub}, pubstate = {published}, tppubtype = {article} } @inproceedings{lin2018architectural, title = {The architectural implications of autonomous driving: Constraints and acceleration}, author = {Shih-Chieh Lin and Yunqi Zhang and Chang-Hong Hsu and Matt Skach and Md E Haque and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/AutonomousCar-ASPLOS18.pdf}, year = {2018}, date = {2018-01-01}, booktitle = {Proceedings of the Twenty-Third International Conference on Architectural Support for Programming Languages and Operating Systems}, pages = {751--766}, abstract = {Autonomous driving systems have attracted a significant amount of interest recently, and many industry leaders, such as Google, Uber, Tesla, and Mobileye, have invested a large amount of capital and engineering power on developing such systems. Building autonomous driving systems is particularly challenging due to stringent performance requirements in terms of both making the safe operational decisions and finishing processing at real-time. Despite the recent advancements in technology, such systems are still largely under experimentation and architecting end-to-end autonomous driving systems remains an open research question. To investigate this question, we first present and formalize the design constraints for building an autonomous driving system in terms of performance, predictability, storage, thermal and power. We then build an end-to-end autonomous driving system using state-of-the-art award-winning algorithms to understand the design trade-offs for building such systems. In our real-system characterization, we identify three computational bottlenecks, which conventional multicore CPUs are incapable of processing under the identified design constraints. 
To meet these constraints, we accelerate these algorithms using three accelerator platforms including GPUs, FPGAs, and ASICs, which can reduce the tail latency of the system by 169x, 10x, and 93x respectively. With accelerator-based designs, we are able to build an end-to-end autonomous driving system that meets all the design constraints, and explore the trade-offs among performance, power and the higher accuracy enabled by higher resolution cameras.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{hsu2018smoothoperator, title = {Smoothoperator: Reducing power fragmentation and improving power utilization in large-scale datacenters}, author = {Chang-Hong Hsu and Qingyuan Deng and Jason Mars and Lingjia Tang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/smooth_operator.pdf}, year = {2018}, date = {2018-01-01}, booktitle = {Proceedings of the Twenty-Third International Conference on Architectural Support for Programming Languages and Operating Systems}, pages = {535--548}, abstract = {With the ever growing popularity of cloud computing and web services, Internet companies are in need of increased computing capacity to serve the demand. However, power has become a major limiting factor prohibiting the growth in industry: it is often the case that no more servers can be added to datacenters without surpassing the capacity of the existing power infrastructure. In this work, we first investigate the power utilization in Facebook datacenters. We observe that the combination of provisioning for peak power usage, highly fluctuating traffic, and multi-level power delivery infrastructure leads to significant power budget fragmentation problem and inefficiently low power utilization. To address this issue, our insight is that heterogeneity of power consumption patterns among different services provides opportunities to re-shape the power profile of each power node by re-distributing services. 
By grouping services with asynchronous peak times under the same power node, we can reduce the peak power of each node and thus creating more power head-rooms to allow more servers hosted, achieving higher throughput. Based on this insight, we develop a workload-aware service placement framework to systematically spread the service instances with synchronous power patterns evenly under the power supply tree, greatly reducing the peak power draw at power nodes. We then leverage dynamic power profile reshaping to maximally utilize the headroom unlocked by our placement framework. Our experiments based on real production workload and power traces show that we are able to host up to 13% more machines in production, without changing the underlying power infrastructure. Utilizing the unleashed power headroom with dynamic reshaping, we achieve up to an estimated total of 15% and 11% throughput improvement for latency-critical service and batch service respectively at the same time, with up to 44% of energy slack reduction.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{jain2018gist, title = {Gist: Efficient data encoding for deep neural network training}, author = {Animesh Jain and Amar Phanishayee and Jason Mars and Lingjia Tang and Gennady Pekhimenko}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/08416872.pdf}, year = {2018}, date = {2018-01-01}, booktitle = {2018 ACM/IEEE 45th Annual International Symposium on Computer Architecture (ISCA)}, pages = {776--789}, organization = {IEEE}, abstract = {Modern deep neural networks (DNNs) training typically relies on GPUs to train complex hundred-layer deep networks. A significant problem facing both researchers and industry practitioners is that, as the networks get deeper, the available GPU main memory becomes a primary bottleneck, limiting the size of networks it can train. 
In this paper, we investigate widely used DNNs and find that the major contributors to memory footprint are intermediate layer outputs (feature maps). We then introduce a framework for DNN-layer-specific optimizations (e.g., convolution, ReLU, pool) that significantly reduce this source of main memory pressure on GPUs. We find that a feature map typically has two uses that are spread far apart temporally. Our key approach is to store an encoded representation of feature maps for this temporal gap and decode this data for use in the backward pass; the full-fidelity feature maps are used in the forward pass and relinquished immediately. Based on this approach, we present Gist, our system that employs two classes of layer-specific encoding schemes - lossless and lossy - to exploit existing value redundancy in DNN training to significantly reduce the memory consumption of targeted feature maps. For example, one insight is by taking advantage of the computational nature of back propagation from pool to ReLU layer, we can store the intermediate feature map using just 1 bit instead of 32 bits per value. We deploy these mechanisms in a state-of-the-art DNN framework (CNTK) and observe that Gist reduces the memory footprint to upto 2× across 5 state-of-the-art image classification DNNs, with an average of 1.8× with only 4% performance overhead. 
We also show that further software (e.g., CuDNN) and hardware (e.g., dynamic allocation) optimizations can result in even larger footprint reduction (upto 4.1×).}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{kang2018data, title = {Data collection for dialogue system: A startup perspective}, author = {Yiping Kang and Yunqi Zhang and Jonathan K Kummerfeld and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/N18-3005.pdf}, year = {2018}, date = {2018-01-01}, booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 3 (Industry Papers)}, pages = {33--40}, abstract = {Industrial dialogue systems such as Apple Siri and Google Now rely on large scale diverse and robust training data to enable their sophisticated conversation capability. Crowdsourcing provides a scalable and inexpensive way of data collection but collecting high quality data efficiently requires thoughtful orchestration of the crowdsourcing jobs. Prior study of this topic have focused on tasks only in the academia settings with limited scope or only provide intrinsic dataset analysis, lacking indication on how it affects the trained model performance. In this paper, we present a study of crowdsourcing methods for a user intent classification task in our deployed dialogue system. Our task requires classification of 47 possible user intents and contains many intent pairs with subtle differences. We consider different crowdsourcing job types and job prompts and analyze quantitatively the quality of the collected data and the downstream model performance on a test set of real user queries from production logs. 
Our observation provides insights into designing efficient crowdsourcing jobs and provide recommendations for future dialogue system data collection process.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{kannan2018proctor, title = {Proctor: Detecting and investigating interference in shared datacenters}, author = {Ram Srivatsa Kannan and Animesh Jain and Michael A Laurenzano and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/08366937.pdf}, year = {2018}, date = {2018-01-01}, booktitle = {2018 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)}, pages = {76--86}, organization = {IEEE}, abstract = {Cloud-scale datacenter management systems utilize virtualization to provide performance isolation while maximizing the utilization of the underlying hardware infrastructure. However, virtualization does not provide complete performance isolation as Virtual Machines (VMs) still compete for nonreservable shared resources (like caches, network, I/O bandwidth etc.) This becomes highly challenging to address in datacenter environments housing tens of thousands of VMs, causing degradation in application performance. Addressing this problem for production datacenters requires a non-intrusive scalable solution that 1) detects performance intrusion and 2) investigates both the intrusive VMs causing interference, as well as the resource(s) for which the VMs are competing for. To address this problem, this paper introduces Proctor, a real time, lightweight and scalable analytics fabric that detects performance intrusive VMs and identifies its root causes from among the arbitrary VMs running in shared datacenters across 4 key hardware resources - network, I/O, cache, and CPU. 
Proctor is based on a robust statistical approach that requires no special profiling phases, standing in stark contrast to a wide body of prior work that assumes pre-acquisition of application level information prior to its execution. By detecting performance degradation and identifying the root cause VMs and their metrics, Proctor can be utilized to dramatically improve the performance outcomes of applications executing in large-scale datacenters. From our experiments, we are able to show that when we deploy Proctor in a datacenter housing a mix of I/O, network, compute and cache-sensitive applications, it is able to effectively pinpoint performance intrusive VMs. Further, we observe that when Proctor is applied with migration, the application-level Quality-of-Service improves by an average of 2.2× as compared to systems which are unable to detect, identify and pinpoint performance intrusion and their root causes.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{skach2018virtual, title = {Virtual melting temperature: managing server load to minimize cooling overhead with phase change materials}, author = {Matt Skach and Manish Arora and Dean Tullsen and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/08416815.pdf}, year = {2018}, date = {2018-01-01}, booktitle = {2018 ACM/IEEE 45th Annual International Symposium on Computer Architecture (ISCA)}, pages = {15--28}, organization = {IEEE}, abstract = {As the power density and power consumption of large scale datacenters continue to grow, the challenges of removing heat from these datacenters and keeping them cool is an increasingly urgent and costly. With the largest datacenters now exceeding over 200 MW of power, the cooling systems that prevent overheating cost on the order of tens of millions of dollars. 
Prior work proposed to deploy phase change materials (PCM) and use Thermal Time Shifting (TTS) to reshape the thermal load of a datacenter by storing heat during peak hours of high utilization and releasing it during off hours when utilization is low, enabling a smaller cooling system to handle the same peak load. The peak cooling load reduction enabled by TTS is greatly beneficial, however TTS is a passive system that cannot handle many workload mixtures or adapt to changing load or environmental characteristics. In this work we propose VMT, a thermal aware job placement technique that adds an active, tunable component to enable greater control over datacenter thermal output. We propose two different job placement algorithms for VMT and perform a scale out study of VMT in a simulated server cluster. We provide analysis of the use cases and trade-offs of each algorithm, and show that VMT reduces peak cooling load by up to 12.8% to provide over two million dollars in cost savings when a smaller cooling system is installed, or allows for over 7,000 additional servers to be added in scenarios where TTS is ineffective.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{lin2018adasa, title = {Adasa: A Conversational In-Vehicle Digital Assistant for Advanced Driver Assistance Features}, author = {Shih-Chieh Lin and Chang-Hong Hsu and Walter Talamonti and Yunqi Zhang and Steve Oney and Jason Mars and Lingjia Tang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/lin2018adasa.pdf}, year = {2018}, date = {2018-01-01}, booktitle = {Proceedings of the 31st Annual ACM Symposium on User Interface Software and Technology}, pages = {531--542}, abstract = {Advanced Driver Assistance Systems (ADAS) come equipped on most modern vehicles and are intended to assist the driver and enhance the driving experience through features such as lane keeping system and adaptive cruise control. 
However, recent studies show that few people utilize these features for several reasons. First, ADAS features were not common until recently. Second, most users are unfamiliar with these features and do not know what to expect. Finally, the interface for operating these features is not intuitive. To help drivers understand ADAS features, we present a conversational in-vehicle digital assistant that responds to drivers' questions and commands in natural language. With the system prototyped herein, drivers can ask questions or command using unconstrained natural language in the vehicle, and the assistant trained by using advanced machine learning techniques, coupled with access to vehicle signals, responds in real-time based on conversational context. Results of our system prototyped on a production vehicle are presented, demonstrating its effectiveness in improving driver understanding and usability of ADAS.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{jain2018architectural, title = {Architectural support for convolutional neural networks on modern CPUs}, author = {Animesh Jain and Michael A Laurenzano and Gilles A Pokam and Jason Mars and Lingjia Tang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3243176.3243177.pdf}, year = {2018}, date = {2018-01-01}, booktitle = {Proceedings of the 27th International Conference on Parallel Architectures and Compilation Techniques}, pages = {1--13}, abstract = {A key focus of recent work in our community has been on devising increasingly sophisticated acceleration devices for deep neural network (DNN) computation, especially for networks driven by convolution layers. Yet, despite the promise of substantial improvements in performance and energy consumption offered by these approaches, general purpose computing is not going away because its traditional well-understood programming model and continued wide deployment. 
Therefore, the question arises as to what can be done, if anything, to evolve conventional CPUs to accommodate efficient deep neural network computation. This work focuses on the challenging problem of identifying and alleviating the performance bottlenecks for convolution layer computation for conventional CPU platforms. We begin by performing a detailed study of a range of CNN-based applications on a modern CPU microarchitecture, finding that designing a physical register file (PRF) capable of feeding computational units is the primary barrier that prevents the addition of more compute units in the CPU, limiting the performance improvements that can be achieved by CPU on convolution layers. We present the design of a novel, minimally intrusive set of microarchitectural and ISA extensions that address this problem and describe the code generation support needed to take advantage our design. Through a detailed evaluation that covers 5 state-of-the-art neural network applications, we observe that applying these extensions allows packing more compute in the CPU while keeping PRF energy in check, achieving a 2× performance improvement and a 2.7× energy-delay product improvement against a popular Intel Haswell server processor baseline.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @misc{hundt2017allocation, title = {Allocation of tasks in large scale computing systems}, author = {Robert Hundt and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/pat9563532.pdf}, year = {2017}, date = {2017-02-01}, abstract = {Aspects of the invention may be used to allocate tasks among computing machines in large scale computing systems. In one aspect, the method includes executing a first task in the plurality of tasks on a first computing machine and determining a performance degradation threshold for the first task. 
The method further includes calculating a predicted performance degradation of the first task when a second task is executed on the first computing machine, wherein the predicted performance degradation is determined by comparing a performance interference score of the second task with a performance sensitivity curve of the first task. The method further includes executing the second task on the first computing machine when the predicted performance degradation of the first task is below the performance degradation threshold.}, note = {US Patent 9,563,532}, keywords = {Patent}, pubstate = {published}, tppubtype = {misc} } @article{hsu2017reining, title = {Reining in long tails in warehouse-scale computers with quick voltage boosting using adrenaline}, author = {Chang-Hong Hsu and Yunqi Zhang and Michael A Laurenzano and David Meisner and Thomas Wenisch and Ronald G Dreslinski and Jason Mars and Lingjia Tang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3054742.pdf}, year = {2017}, date = {2017-01-01}, journal = {ACM Transactions on Computer Systems (TOCS)}, volume = {35}, number = {1}, pages = {1--33}, publisher = {ACM New York, NY, USA}, abstract = {Reducing the long tail of the query latency distribution in modern warehouse scale computers is critical for improving performance and quality of service (QoS) of workloads such as Web Search and Memcached. Traditional turbo boost increases a processor’s voltage and frequency during a coarse-grained sliding window, boosting all queries that are processed during that window. However, the inability of such a technique to pinpoint tail queries for boosting limits its tail reduction benefit. In this work, we propose Adrenaline, an approach to leverage finer-granularity (tens of nanoseconds) voltage boosting to effectively rein in the tail latency with query-level precision. Two key insights underlie this work. 
First, emerging finer granularity voltage/frequency boosting is an enabling mechanism for intelligent allocation of the power budget to precisely boost only the queries that contribute to the tail latency; second, per-query characteristics can be used to design indicators for proactively pinpointing these queries, triggering boosting accordingly. Based on these insights, Adrenaline effectively pinpoints and boosts queries that are likely to increase the tail distribution and can reap more benefit from the voltage/frequency boost. By evaluating under various workload configurations, we demonstrate the effectiveness of our methodology. We achieve up to a 2.50 × tail latency improvement for Memcached and up to a 3.03 × for Web Search over coarse-grained dynamic voltage and frequency scaling (DVFS) given a fixed boosting power budget. When optimizing for energy reduction, Adrenaline achieves up to a 1.81 × improvement for Memcached and up to a 1.99 × for Web Search over coarse-grained DVFS. By using the carefully chosen boost thresholds, Adrenaline further improves the tail latency reduction to 4.82 × over coarse-grained DVFS.}, keywords = {Pub}, pubstate = {published}, tppubtype = {article} } @article{kang2017neurosurgeon, title = {Neurosurgeon: Collaborative intelligence between the cloud and mobile edge}, author = {Yiping Kang and Johann Hauswald and Cao Gao and Austin Rovinski and Trevor Mudge and Jason Mars and Lingjia Tang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3037697.3037698.pdf}, year = {2017}, date = {2017-01-01}, journal = {ACM SIGARCH Computer Architecture News}, volume = {45}, number = {1}, pages = {615--629}, publisher = {ACM New York, NY, USA}, abstract = {The computation for today's intelligent personal assistants such as Apple Siri, Google Now, and Microsoft Cortana, is performed in the cloud. 
This cloud-only approach requires significant amounts of data to be sent to the cloud over the wireless network and puts significant computational pressure on the datacenter. However, as the computational resources in mobile devices become more powerful and energy efficient, questions arise as to whether this cloud-only processing is desirable moving forward, and what are the implications of pushing some or all of this compute to the mobile devices on the edge. In this paper, we examine the status quo approach of cloud-only processing and investigate computation partitioning strategies that effectively leverage both the cycles in the cloud and on the mobile device to achieve low latency, low energy consumption, and high datacenter throughput for this class of intelligent applications. Our study uses 8 intelligent applications spanning computer vision, speech, and natural language domains, all employing state-of-the-art Deep Neural Networks (DNNs) as the core machine learning technique. We find that given the characteristics of DNN algorithms, a fine-grained, layer-level computation partitioning strategy based on the data and computation variations of each layer within a DNN has significant latency and energy advantages over the status quo approach. Using this insight, we design Neurosurgeon, a lightweight scheduler to automatically partition DNN computation between mobile devices and datacenters at the granularity of neural network layers. Neurosurgeon does not require per-application profiling. It adapts to various DNN architectures, hardware platforms, wireless networks, and server load levels, intelligently partitioning computation for best latency or best mobile energy. 
We evaluate Neurosurgeon on a state-of-the-art mobile development platform and show that it improves end-to-end latency by 3.1X on average and up to 40.7X, reduces mobile energy consumption by 59.5% on average and up to 94.7%, and improves datacenter throughput by 1.5X on average and up to 6.7X.}, keywords = {Pub}, pubstate = {published}, tppubtype = {article} } @inproceedings{chen2017prophet, title = {Prophet: Precise {QoS} prediction on non-preemptive accelerators to improve utilization in warehouse-scale computers}, author = {Quan Chen and Hailong Yang and Minyi Guo and Ram Srivatsa Kannan and Jason Mars and Lingjia Tang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3093336.3037700.pdf}, year = {2017}, date = {2017-01-01}, booktitle = {Proceedings of the Twenty-Second International Conference on Architectural Support for Programming Languages and Operating Systems}, pages = {17--32}, abstract = {Guaranteeing Quality-of-Service (QoS) of latency-sensitive applications while improving server utilization through application co-location is important yet challenging in modern datacenters. The key challenge is that when applications are co-located on a server, performance interference due to resource contention can be detrimental to the application QoS. Although prior work has proposed techniques to identify "safe" co-locations where application QoS is satisfied by predicting the performance interference on multicores, no such prediction technique on accelerators such as GPUs. In this work, we present Prophet, an approach to precisely predict the performance degradation of latency-sensitive applications on accelerators due to application co-location. 
We analyzed the performance interference on accelerators through a real system investigation and found that unlike on multicores where the key contentious resources are shared caches and main memory bandwidth, the key contentious resources on accelerators are instead processing elements, accelerator memory bandwidth and PCIe bandwidth. Based on this observation, we designed interference models that enable the precise prediction for processing element, accelerator memory bandwidth and PCIe bandwidth contention on real hardware. By using a novel technique to forecast solo-run execution traces of the co-located applications using interference models, Prophet can accurately predict the performance degradation of latency-sensitive applications on non-preemptive accelerators. Using Prophet, we can identify "safe" co-locations on accelerators to improve utilization without violating the QoS target. Our evaluation shows that Prophet can predict the performance degradation with an average prediction error 5.47% on real systems. Meanwhile, based on the prediction, Prophet achieves accelerator utilization improvements of 49.9% on average while maintaining the QoS target of latency-sensitive applications.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{yang2017powerchief, title = {Powerchief: Intelligent power allocation for multi-stage applications to improve responsiveness on power constrained cmp}, author = {Hailong Yang and Quan Chen and Moeiz Riaz and Zhongzhi Luan and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3079856.3080224.pdf}, year = {2017}, date = {2017-01-01}, booktitle = {Proceedings of the 44th Annual International Symposium on Computer Architecture}, pages = {133--146}, abstract = {Modern user facing applications consist of multiple processing stages with a number of service instances in each stage. 
The latency profile of these multi-stage applications is intrinsically variable, making it challenging to provide satisfactory responsiveness. Given a limited power budget, improving the end-to-end latency requires intelligently boosting the bottleneck service across stages using multiple boosting techniques. However, prior work fail to acknowledge the multi-stage nature of user-facing applications and perform poorly in improving responsiveness on power constrained CMP, as they are unable to accurately identify bottleneck service and apply the boosting techniques adaptively. In this paper, we present PowerChief, a runtime framework that 1) provides joint design of service and query to monitor the latency statistics across service stages and accurately identifies the bottleneck service during runtime; 2) adaptively chooses the boosting technique to accelerate the bottleneck service with improved responsiveness; 3) dynamically reallocates the constrained power budget across service stages to accommodate the chosen boosting technique. Evaluated with real world multi-stage applications, PowerChief improves the average latency by 20.3x and 32.4x (99% tail latency by 13.3x and 19.4x) for Sirius and Natural Language Processing applications respectively compared to stage-agnostic power allocation. 
In addition, for the given QoS target, PowerChief reduces the power consumption of Sirius and Web Search applications by 23% and 33% respectively over prior work.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @article{skach2017thermal, title = {Thermal time shifting: Decreasing datacenter cooling costs with phase change materials}, author = {Matt Skach and Manish Arora and Chang-Hong Hsu and Qi Li and Dean Tullsen and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/2749469.2749474.pdf}, year = {2017}, date = {2017-01-01}, journal = {IEEE Internet Computing}, publisher = {IEEE}, abstract = {Datacenters, or warehouse scale computers, are rapidly increasing in size and power consumption. However, this growth comes at the cost of an increasing thermal load that must be removed to prevent overheating and server failure. In this paper, we propose to use phase changing materials (PCM) to shape the thermal load of a datacenter, absorbing and releasing heat when it is advantageous to do so. We present and validate a methodology to study the impact of PCM on a datacenter, and evaluate two important opportunities for cost savings. We find that in a datacenter with full cooling system subscription, PCM can reduce the necessary cooling system size by up to 12% without impacting peak throughput, or increase the number of servers by up to 14.6% without increasing the cooling load. 
In a thermally constrained setting, PCM can increase peak throughput up to 69% while delaying the onset of thermal limits by over 3 hours.}, keywords = {Pub}, pubstate = {published}, tppubtype = {article} } @inproceedings{hill2017deftnn, title = {{DeftNN}: Addressing bottlenecks for {DNN} execution on {GPUs} via synapse vector elimination and near-compute data fission}, author = {Parker Hill and Animesh Jain and Mason Hill and Babak Zamirai and Chang-Hong Hsu and Michael A Laurenzano and Scott Mahlke and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3123939.3123970.pdf}, year = {2017}, date = {2017-01-01}, booktitle = {Proceedings of the 50th Annual IEEE/ACM International Symposium on Microarchitecture}, pages = {786--799}, abstract = {Deep neural networks (DNNs) are key computational building blocks for emerging classes of web services that interact in real time with users via voice, images and video inputs. Although GPUs have gained popularity as a key accelerator platform for deep learning workloads, the increasing demand for DNN computation leaves a significant gap between the compute capabilities of GPU-enabled datacenters and the compute needed to service demand. The state-of-the-art techniques to improve DNN performance have significant limitations in bridging the gap on real systems. Current network pruning techniques remove computation, but the resulting networks map poorly to GPU architectures, yielding no performance benefit or even slowdowns. Meanwhile, current bandwidth optimization techniques focus on reducing off-chip bandwidth while overlooking on-chip bandwidth, a key DNN bottleneck. To address these limitations, this work introduces DeftNN, a GPU DNN execution framework that targets the key architectural bottlenecks of DNNs on GPUs to automatically and transparently improve execution performance. 
DeftNN is composed of two novel optimization techniques - (1) synapse vector elimination, a technique that identifies non-contributing synapses in the DNN and carefully transforms data and removes the computation and data movement of these synapses while fully utilizing the GPU to improve performance, and (2) near-compute data fission, a mechanism for scaling down the on-chip data movement requirements within DNN computations. Our evaluation of DeftNN spans 6 state-of-the-art DNNs. By applying both optimizations in concert, DeftNN is able to achieve an average speedup of 2.1X on real GPU hardware. We also introduce a small additional hardware unit per GPU core to facilitate efficient data fission operations, increasing the speedup achieved by DeftNN to 2.6X.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{jain2016cpsa, title = {CPSA: Compute precisely store approximately}, author = {Animesh Jain and Parker Hill and Michael A Laurenzano and Md E Haque and Muneeb Khan and Scott Mahlke and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/jain.pdf}, year = {2016}, date = {2016-01-01}, booktitle = {Workshop on Approximate Computing Across the Stack}, abstract = {We propose a new approximate-computing paradigm, where computations are performed precisely while the data is stored approximately in the memory using data packing. This lets us reduce the memory traffic, improving application memory behavior. 
It achieves 85% memory savings for an accuracy target of 90%.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @article{hauswald2016designing, title = {Designing future warehouse-scale computers for Sirius, an end-to-end voice and vision personal assistant}, author = {Johann Hauswald and Michael A Laurenzano and Yunqi Zhang and Hailong Yang and Yiping Kang and Cheng Li and Austin Rovinski and Arjun Khurana and Ronald G Dreslinski and Trevor Mudge and others}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/2870631.pdf}, year = {2016}, date = {2016-01-01}, journal = {ACM Transactions on Computer Systems (TOCS)}, volume = {34}, number = {1}, pages = {1--32}, publisher = {ACM New York, NY, USA}, abstract = {As user demand scales for intelligent personal assistants (IPAs) such as Apple’s Siri, Google’s Google Now, and Microsoft’s Cortana, we are approaching the computational limits of current datacenter (DC) architectures. It is an open question how future server architectures should evolve to enable this emerging class of applications, and the lack of an open-source IPA workload is an obstacle in addressing this question. In this article, we present the design of Sirius, an open end-to-end IPA Web-service application that accepts queries in the form of voice and images, and responds with natural language. We then use this workload to investigate the implications of four points in the design space of future accelerator-based server architectures spanning traditional CPUs, GPUs, manycore throughput co-processors, and FPGAs. To investigate future server designs for Sirius, we decompose Sirius into a suite of eight benchmarks (Sirius Suite) comprising the computationally intensive bottlenecks of Sirius. We port Sirius Suite to a spectrum of accelerator platforms and use the performance and power trade-offs across these platforms to perform a total cost of ownership (TCO) analysis of various server design points. 
In our study, we find that accelerators are critical for the future scalability of IPA services. Our results show that GPU- and FPGA-accelerated servers improve the query latency on average by 8.5× and 15×, respectively. For a given throughput, GPU- and FPGA-accelerated servers can reduce the TCO of DCs by 2.3× and 1.3×, respectively.}, keywords = {Pub}, pubstate = {published}, tppubtype = {article} } @article{chen2016baymax, title = {Baymax: Qos awareness and increased utilization for non-preemptive accelerators in warehouse scale computers}, author = {Quan Chen and Hailong Yang and Jason Mars and Lingjia Tang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/2872362.2872368.pdf}, year = {2016}, date = {2016-01-01}, journal = {ACM SIGPLAN Notices}, volume = {51}, number = {4}, pages = {681--696}, publisher = {ACM New York, NY, USA}, abstract = {Modern warehouse-scale computers (WSCs) are being outfitted with accelerators to provide the significant compute required by emerging intelligent personal assistant (IPA) workloads such as voice recognition, image classification, and natural language processing. It is well known that the diurnal user access pattern of user-facing services provides a strong incentive to co-locate applications for better accelerator utilization and efficiency, and prior work has focused on enabling co-location on multicore processors. However, interference when co-locating applications on non-preemptive accelerators is fundamentally different than contention on multi-core CPUs and introduces a new set of challenges to reduce QoS violation. To address this open problem, we first identify the underlying causes for QoS violation in accelerator-outfitted servers. Our experiments show that queuing delay for the compute resources and PCI-e bandwidth contention for data transfer are the main two factors that contribute to the long tails of user-facing applications. 
We then present Baymax, a runtime system that orchestrates the execution of compute tasks from different applications and mitigates PCI-e bandwidth contention to deliver the required QoS for user-facing applications and increase the accelerator utilization. Using DjiNN, a deep neural network service, Sirius, an end-to-end IPA workload, and traditional applications on a Nvidia K40 GPU, our evaluation shows that Baymax improves the accelerator utilization by 91.3% while achieving the desired 99%-ile latency target for user-facing applications. In fact, Baymax reduces the 99%-ile latency of user-facing applications by up to 195x over default execution.}, keywords = {Pub}, pubstate = {published}, tppubtype = {article} } @inproceedings{zhang2016treadmill, title = {Treadmill: Attributing the source of tail latency through precise load testing and statistical inference}, author = {Yunqi Zhang and David Meisner and Jason Mars and Lingjia Tang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/ISCA.2016.47.pdf}, year = {2016}, date = {2016-01-01}, booktitle = {2016 ACM/IEEE 43rd Annual International Symposium on Computer Architecture (ISCA)}, pages = {456--468}, organization = {IEEE}, abstract = {Managing tail latency of requests has become one of the primary challenges for large-scale Internet services. Data centers are quickly evolving and service operators frequently desire to make changes to the deployed software and production hardware configurations. Such changes demand a confident understanding of the impact on one's service, in particular its effect on tail latency (e.g., 95th- or 99th-percentile response latency of the service). Evaluating the impact on the tail is challenging because of its inherent variability. Existing tools and methodologies for measuring these effects suffer from a number of deficiencies including poor load tester design, statistically inaccurate aggregation, and improper attribution of effects. 
As shown in the paper, these pitfalls can often result in misleading conclusions. In this paper, we develop a methodology for statistically rigorous performance evaluation and performance factor attribution for server workloads. First, we find that careful design of the server load tester can ensure high quality performance evaluation, and empirically demonstrate the inaccuracy of load testers in previous work. Learning from the design flaws in prior work, we design and develop a modular load tester platform, Treadmill, that overcomes pitfalls of existing tools. Next, utilizing Treadmill, we construct measurement and analysis procedures that can properly attribute performance factors. We rely on statistically-sound performance evaluation and quantile regression, extending it to accommodate the idiosyncrasies of server systems. Finally, we use our augmented methodology to evaluate the impact of common server hardware features with Facebook production workloads on production hardware. We decompose the effects of these features on request tail latency and demonstrate that our evaluation methodology provides superior results, particularly in capturing complicated and counter-intuitive performance behaviors. 
By tuning the hardware features as suggested by the attribution, we reduce the 99th-percentile latency by 43% and its variance by 93%.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{laurenzano2016powerchop, title = {Powerchop: Identifying and managing non-critical units in hybrid processor architectures}, author = {Michael A Laurenzano and Yunqi Zhang and Jiang Chen and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3007787.3001152.pdf}, year = {2016}, date = {2016-01-01}, booktitle = {2016 ACM/IEEE 43rd Annual International Symposium on Computer Architecture (ISCA)}, pages = {140--152}, organization = {IEEE}, abstract = {On-core microarchitectural structures consume significant portions of a processor's power budget. However, depending on application characteristics, those structures do not always provide (much) performance benefit. While timeout-based power gating techniques have been leveraged for underutilized cores and inactive functional units, these techniques have not directly translated to high-activity units such as vector processing units, complex branch predictors, and caches. The performance benefit provided by these units does not necessarily correspond with unit activity, but instead is a function of application characteristics. This work introduces PowerChop, a novel technique that leverages the unique capabilities of HW/SW co-designed hybrid processors to enact unit-level power management at the application phase level. PowerChop adds two small additional hardware units to facilitate phase identification and triggering different power states, enabling the software layer to cheaply track, predict and take advantage of varying unit criticality across application phases by powering gating units that are not needed for performant execution. 
Through detailed experimentation, we find that PowerChop significantly decreases power consumption, reducing the leakage power of a hybrid server processor by 9% on average (up to 33%) and a hybrid mobile processor by 19% (up to 40%) while introducing just 2% slowdown.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{laurenzano2016input, title = {Input responsiveness: using canary inputs to dynamically steer approximation}, author = {Michael A Laurenzano and Parker Hill and Mehrzad Samadi and Scott Mahlke and Jason Mars and Lingjia Tang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/2908080.2908087.pdf}, year = {2016}, date = {2016-01-01}, booktitle = {Proceedings of the 37th ACM SIGPLAN Conference on Programming Language Design and Implementation}, pages = {161--176}, abstract = {This paper introduces Input Responsive Approximation (IRA), an approach that uses a canary input — a small program input carefully constructed to capture the intrinsic properties of the original input — to automatically control how program approximation is applied on an input-by-input basis. Motivating this approach is the observation that many of the prior techniques focusing on choosing how to approximate arrive at conservative decisions by discounting substantial differences between inputs when applying approximation. The main challenges in overcoming this limitation lie in making the choice of how to approximate both effectively (e.g., the fastest approximation that meets a particular accuracy target) and rapidly for every input. With IRA, each time the approximate program is run, a canary input is constructed and used dynamically to quickly test a spectrum of approximation alternatives. Based on these runtime tests, the approximation that best fits the desired accuracy constraints is selected and applied to the full input to produce an approximate result. 
We use IRA to select and parameterize mixes of four approximation techniques from the literature for a range of 13 image processing, machine learning, and data mining applications. Our results demonstrate that IRA significantly outperforms prior approaches, delivering an average of 10.2× speedup over exact execution while minimizing accuracy losses in program outputs.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{jain2016continuous, title = {Continuous shape shifting: Enabling loop co-optimization via near-free dynamic code rewriting}, author = {Animesh Jain and Michael A Laurenzano and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3195638.3195666.pdf}, year = {2016}, date = {2016-01-01}, booktitle = {2016 49th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)}, pages = {1--12}, organization = {IEEE}, abstract = {The class of optimizations characterized by manipulating a loop's iteration space for improved cache locality and reuse (i.e., cache tiling / blocking / strip mine and interchange) are static optimizations requiring a priori information about the microarchitectural and runtime environment of an application binary. However, particularly in datacenter environments, deployed applications face numerous dynamic environments over their lifetimes. As a result, this class of optimizations can result in sub-optimal performance due to the inability to flexibly adapt iteration spaces as cache conditions change at runtime. This paper introduces continuous shape shifting, a compilation approach that removes the risks of cache tiling optimizations by dynamically rewriting (and reshaping) deployed, running application code. To realize continuous shape shifting, we present ShapeShifter, a framework for continuous monitoring of co-running applications and their runtime environments to reshape loop iteration spaces and pinpoint near-optimal loop tile configurations. 
Upon identifying a need for reshaping, a new tiling approach is quickly constructed for the application, new code is dynamically generated and is then seamlessly stitched into the running application with near-zero overhead. Our evaluation on a wide spectrum of runtime scenarios demonstrates that ShapeShifter achieves an average of 10--40% performance improvement (up to 2.4X) on real systems depending on the runtime environment compared to an oracle static loop tiling baseline.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{jain2016concise, title = {Concise loads and stores: The case for an asymmetric compute-memory architecture for approximation}, author = {Animesh Jain and Parker Hill and Shih-Chieh Lin and Muneeb Khan and Md E Haque and Michael A Laurenzano and Scott Mahlke and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3195638.3195688.pdf}, year = {2016}, date = {2016-01-01}, booktitle = {2016 49th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)}, pages = {1--13}, organization = {IEEE}, abstract = {Cache capacity and memory bandwidth play critical roles in application performance, particularly for data-intensive applications from domains that include machine learning, numerical analysis, and data mining. Many of these applications are also tolerant to imprecise inputs and have loose constraints on the quality of output, making them ideal candidates for approximate computing. This paper introduces a novel approximate computing technique that decouples the format of data in the memory hierarchy from the format of data in the compute subsystem to significantly reduce the cost of storing and moving bits throughout the memory hierarchy and improve application performance. 
This asymmetric compute-memory extension to conventional architectures, ACME, adds two new instruction classes to the ISA - load-concise and store-concise - along with three small functional units to the micro-architecture to support these instructions. ACME does not affect exact execution of applications and comes into play only when concise memory operations are used. Through detailed experimentation we find that ACME is very effective at trading result accuracy for improved application performance. Our results show that ACME achieves a 1.3X speedup (up to 1.8X) while maintaining 99% accuracy, or a 1.1X speedup while maintaining 99.999% accuracy. Moreover, our approach incurs negligible area and power overheads, adding just 0.005% area and 0.1% power to a conventional modern architecture.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{zekany2016crystalball, title = {CrystalBall: Statically analyzing runtime behavior via deep sequence learning}, author = {Stephen Zekany and Daniel Rings and Nathan Harada and Michael A Laurenzano and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3195638.3195667.pdf}, year = {2016}, date = {2016-01-01}, booktitle = {2016 49th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)}, pages = {1--12}, organization = {IEEE}, abstract = {Understanding dynamic program behavior is critical in many stages of the software development lifecycle, for purposes as diverse as optimization, debugging, testing, and security. This paper focuses on the problem of predicting dynamic program behavior statically. We introduce a novel technique to statically identify hot paths that leverages emerging deep learning techniques to take advantage of their ability to learn subtle, complex relationships between sequences of inputs. This approach maps well to the problem of identifying the behavior of sequences of basic blocks in program execution. 
Our technique is also designed to operate on the compiler's intermediate representation (IR), as opposed to the approaches taken by prior techniques that have focused primarily on source code, giving our approach language-independence. We describe the pitfalls of conventional metrics used for hot path prediction such as accuracy, and motivate the use of Area Under the Receiver Operating Characteristic curve (AUROC). Through a thorough evaluation of our technique on complex applications that include the SPEC CPU2006 benchmarks, we show that our approach achieves an AUROC of 0.85.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @article{hauswald2016sirius, title = {Sirius implications for future warehouse-scale computers}, author = {Johann Hauswald and Michael A Laurenzano and Yunqi Zhang and Cheng Li and Austin Rovinski and Arjun Khurana and Ronald G Dreslinski and Trevor Mudge and Vinicius Petrucci and Lingjia Tang and others}, url = {https://www.jasonmars.org/wp-content/uploads/2020/04/07478443.pdf}, year = {2016}, date = {2016-01-01}, journal = {IEEE Micro}, volume = {36}, number = {3}, pages = {42--53}, publisher = {IEEE}, abstract = {Demand is expected to grow significantly for cloud services that deliver sophisticated artificial intelligence on the critical path of user queries, as is the case with intelligent personal assistants such as Apple's Siri. If the prediction of the trend is correct, these types of applications will likely consume most of the world's computing cycles. 
The Sirius project was motivated to investigate what this future might look like and how cloud architectures should evolve to achieve it.}, keywords = {Pub}, pubstate = {published}, tppubtype = {article} } @inproceedings{petrucci2015octopus, title = {Octopus-Man: QoS-driven task management for heterogeneous multicores in warehouse-scale computers}, author = {Vinicius Petrucci and Michael A Laurenzano and John Doherty and Yunqi Zhang and Daniel Mosse and Jason Mars and Lingjia Tang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/05/07056037.pdf}, year = {2015}, date = {2015-01-01}, booktitle = {2015 IEEE 21st International Symposium on High Performance Computer Architecture (HPCA)}, pages = {246--258}, organization = {IEEE}, abstract = {Heterogeneous multicore architectures have the potential to improve energy efficiency by integrating power-efficient wimpy cores with high-performing brawny cores. However, it is an open question as how to deliver energy reduction while ensuring the quality of service (QoS) of latency-sensitive web-services running on such heterogeneous multicores in warehouse-scale computers (WSCs). In this work, we first investigate the implications of heterogeneous multicores in WSCs and show that directly adopting heterogeneous multicores without re-designing the software stack to provide QoS management leads to significant QoS violations. We then present Octopus-Man, a novel QoS-aware task management solution that dynamically maps latency-sensitive tasks to the least power-hungry processing resources that are sufficient to meet the QoS requirements. Using carefully-designed feedback-control mechanisms, Octopus-Man addresses critical challenges that emerge due to uncertainties in workload fluctuations and adaptation dynamics in a real system. 
Our evaluation using web-search and memcached running on a real-system Intel heterogeneous prototype demonstrates that Octopus-Man improves energy efficiency by up to 41% (CPU power) and up to 15% (system power) over an all-brawny WSC design while adhering to specified QoS targets.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{hsu2015adrenaline, title = {Adrenaline: Pinpointing and reining in tail queries with quick voltage boosting}, author = {Chang-Hong Hsu and Yunqi Zhang and Michael A Laurenzano and David Meisner and Thomas Wenisch and Jason Mars and Lingjia Tang and Ronald G Dreslinski}, url = {https://www.jasonmars.org/wp-content/uploads/2020/05/07056039.pdf}, year = {2015}, date = {2015-01-01}, booktitle = {2015 IEEE 21st International Symposium on High Performance Computer Architecture (HPCA)}, pages = {271--282}, organization = {IEEE}, abstract = {Reducing the long tail of the query latency distribution in modern warehouse scale computers is critical for improving performance and quality of service of workloads such as Web Search and Memcached. Traditional turbo boost increases a processor's voltage and frequency during a coarse-grain sliding window, boosting all queries that are processed during that window. However, the inability of such a technique to pinpoint tail queries for boosting limits its tail reduction benefit. In this work, we propose Adrenaline, an approach to leverage finer granularity, 10's of nanoseconds, voltage boosting to effectively rein in the tail latency with query-level precision. Two key insights underlie this work. First, emerging finer granularity voltage/frequency boosting is an enabling mechanism for intelligent allocation of the power budget to precisely boost only the queries that contribute to the tail latency; and second, per-query characteristics can be used to design indicators for proactively pinpointing these queries, triggering boosting accordingly. 
Based on these insights, Adrenaline effectively pinpoints and boosts queries that are likely to increase the tail distribution and can reap more benefit from the voltage/frequency boost. By evaluating under various workload configurations, we demonstrate the effectiveness of our methodology. We achieve up to a 2.50x tail latency improvement for Memcached and up to a 3.03x for Web Search over coarse-grained DVFS given a fixed boosting power budget. When optimizing for energy reduction, Adrenaline achieves up to a 1.81x improvement for Memcached and up to a 1.99x for Web Search over coarse-grained DVFS.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{hauswald2015sirius, title = {Sirius: An open end-to-end voice and vision personal assistant and its implications for future warehouse scale computers}, author = {Johann Hauswald and Michael A Laurenzano and Yunqi Zhang and Cheng Li and Austin Rovinski and Arjun Khurana and Ronald G Dreslinski and Trevor Mudge and Vinicius Petrucci and Lingjia Tang and others}, url = {https://www.jasonmars.org/wp-content/uploads/2020/05/2694344.2694347.pdf}, year = {2015}, date = {2015-01-01}, booktitle = {Proceedings of the Twentieth International Conference on Architectural Support for Programming Languages and Operating Systems}, pages = {223--238}, abstract = {As user demand scales for intelligent personal assistants (IPAs) such as Apple's Siri, Google's Google Now, and Microsoft's Cortana, we are approaching the computational limits of current datacenter architectures. It is an open question how future server architectures should evolve to enable this emerging class of applications, and the lack of an open-source IPA workload is an obstacle in addressing this question. In this paper, we present the design of Sirius, an open end-to-end IPA web-service application that accepts queries in the form of voice and images, and responds with natural language. 
We then use this workload to investigate the implications of four points in the design space of future accelerator-based server architectures spanning traditional CPUs, GPUs, manycore throughput co-processors, and FPGAs. To investigate future server designs for Sirius, we decompose Sirius into a suite of 7 benchmarks (Sirius Suite) comprising the computationally intensive bottlenecks of Sirius. We port Sirius Suite to a spectrum of accelerator platforms and use the performance and power trade-offs across these platforms to perform a total cost of ownership (TCO) analysis of various server design points. In our study, we find that accelerators are critical for the future scalability of IPA services. Our results show that GPU- and FPGA-accelerated servers improve the query latency on average by 10x and 16x. For a given throughput, GPU- and FPGA-accelerated servers can reduce the TCO of datacenters by 2.6x and 1.4x, respectively.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{skach2015thermal, title = {Thermal time shifting: Leveraging phase change materials to reduce cooling costs in warehouse-scale computers}, author = {Matt Skach and Manish Arora and Chang-Hong Hsu and Qi Li and Dean Tullsen and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/05/07284085.pdf}, year = {2015}, date = {2015-01-01}, booktitle = {Proceedings of the 42nd Annual International Symposium on Computer Architecture}, pages = {439--449}, abstract = {Datacenters, or warehouse scale computers, are rapidly increasing in size and power consumption. However, this growth comes at the cost of an increasing thermal load that must be removed to prevent overheating and server failure. In this paper, we propose to use phase changing materials (PCM) to shape the thermal load of a datacenter, absorbing and releasing heat when it is advantageous to do so. 
We present and validate a methodology to study the impact of PCM on a datacenter, and evaluate two important opportunities for cost savings. We find that in a datacenter with full cooling system subscription, PCM can reduce the necessary cooling system size by up to 12% without impacting peak throughput, or increase the number of servers by up to 14.6% without increasing the cooling load. In a thermally constrained setting, PCM can increase peak throughput up to 69% while delaying the onset of thermal limits by over 3 hours.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{hauswald2015djinn, title = {DjiNN and Tonic: DNN as a service and its implications for future warehouse scale computers}, author = {Johann Hauswald and Yiping Kang and Michael A Laurenzano and Quan Chen and Cheng Li and Trevor Mudge and Ronald G Dreslinski and Jason Mars and Lingjia Tang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/05/07284053.pdf}, year = {2015}, date = {2015-01-01}, booktitle = {2015 ACM/IEEE 42nd Annual International Symposium on Computer Architecture (ISCA)}, pages = {27--40}, organization = {IEEE}, abstract = {As applications such as Apple Siri, Google Now, Microsoft Cortana, and Amazon Echo continue to gain traction, webservice companies are adopting large deep neural networks (DNN) for machine learning challenges such as image processing, speech recognition, natural language processing, among others. A number of open questions arise as to the design of a server platform specialized for DNN and how modern warehouse scale computers (WSCs) should be outfitted to provide DNN as a service for these applications. In this paper, we present DjiNN, an open infrastructure for DNN as a service in WSCs, and Tonic Suite, a suite of 7 end-to-end applications that span image, speech, and language processing. 
We use DjiNN to design a high throughput DNN system based on massive GPU server designs and provide insights as to the varying characteristics across applications. After studying the throughput, bandwidth, and power properties of DjiNN and Tonic Suite, we investigate several design points for future WSC architectures. We investigate the total cost of ownership implications of having a WSC with a disaggregated GPU pool versus a WSC composed of homogeneous integrated GPU servers. We improve DNN throughput by over 120× for all but one application (40× for Facial Recognition) on an NVIDIA K40 GPU. On a GPU server composed of 8 NVIDIA K40s, we achieve near-linear scaling (around 1000× throughput improvement) for 3 of the 7 applications. Through our analysis, we also find that GPU-enabled WSCs improve total cost of ownership over CPU-only designs by 4-20×, depending on the composition of the workload.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{khan2015arep, title = {AREP: Adaptive resource efficient prefetching for maximizing multicore performance}, author = {Muneeb Khan and Michael A Laurenzano and Jason Mars and Erik Hagersten and David Black-Schaffer}, url = {https://www.jasonmars.org/wp-content/uploads/2020/05/07429320.pdf}, year = {2015}, date = {2015-01-01}, booktitle = {2015 International Conference on Parallel Architecture and Compilation (PACT)}, pages = {367--378}, organization = {IEEE}, abstract = {Modern processors widely use hardware prefetching to hide memory latency. While aggressive hardware prefetchers can improve performance significantly for some applications, they can limit the overall performance in highly-utilized multicore processors by saturating the offchip bandwidth and wasting last-level cache capacity. Co-executing applications can slowdown due to contention over these shared resources. 
This work introduces Adaptive Resource Efficient Prefetching (AREP) -- a runtime framework that dynamically combines software prefetching and hardware prefetching to maximize throughput in highly utilized multicore processors. AREP achieves better performance by prefetching data in a resource efficient way -- conserving offchip-bandwidth and last-level cache capacity with accurate prefetching and by applying cache-bypassing when possible. AREP dynamically explores a mix of hardware/software prefetching policies, then selects and applies the best performing policy. AREP is phase-aware and re-explores (at runtime) for the best prefetching policy at phase boundaries. A multitude of experiments with workload mixes and parallel applications on a modern high performance multicore show that AREP can increase throughput by up to 49% (8.1% on average). This is complemented by improved fairness, resulting in average quality of service above 94%.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{zhai2014happy, title = {Happy: Hyperthread-aware power profiling dynamically}, author = {Yan Zhai and Xiao Zhang and Stephane Eranian and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/05/atc14-paper-zhai.pdf}, year = {2014}, date = {2014-01-01}, booktitle = {2014 USENIX Annual Technical Conference (USENIX ATC 2014)}, pages = {211--217}, abstract = {Quantifying the power consumption of individual applications co-running on a single server is a critical component for software-based power capping, scheduling, and provisioning techniques in modern datacenters. However, with the proliferation of hyperthreading in the last few generations of server-grade processor designs, the challenge of accurately and dynamically performing this power attribution to individual threads has been significantly exacerbated. 
Due to the sharing of core-level resources such as functional units, prior techniques are not suitable to attribute the power consumption between hyperthreads sharing a physical core. In this paper, we present a runtime mechanism that quantifies and attributes power consumption to individual jobs at fine granularity. Specifically, we introduce a hyperthread-aware power model that differentiates between the states when both hardware threads of a core are in use, and when only one thread is in use. By capturing these two different states, we are able to accurately attribute power to each logical CPU in modern servers. We conducted experiments with several Google production workloads on an Intel Sandy Bridge server. Compared to prior hyperthread-oblivious model, HaPPy is substantially more accurate, reducing the prediction error from 20.5% to 7.5% on average and from 31.5% to 9.4% in the worst case.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{zhang2014smite, title = {{SMiTe}: Precise {QoS} prediction on real-system {SMT} processors to improve utilization in warehouse scale computers}, author = {Yunqi Zhang and Michael A Laurenzano and Jason Mars and Lingjia Tang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/05/07011405.pdf}, year = {2014}, date = {2014-01-01}, booktitle = {2014 47th Annual IEEE/ACM International Symposium on Microarchitecture}, pages = {406--418}, organization = {IEEE}, abstract = {One of the key challenges for improving efficiency in warehouse scale computers (WSCs) is to improve server utilization while guaranteeing the quality of service (QoS) of latency-sensitive applications. To this end, prior work has proposed techniques to precisely predict performance and QoS interference to identify 'safe' application co-locations. However, such techniques are only applicable to resources shared across cores. 
Achieving such precise interference prediction on real-system simultaneous multithreading (SMT) architectures has been a significantly challenging open problem due to the complexity introduced by sharing resources within a core. In this paper, we demonstrate through a real-system investigation that the fundamental difference between resource sharing behaviors on CMP and SMT architectures calls for a redesign of the way we model interference. For SMT servers, the interference on different shared resources, including private caches, memory ports, as well as integer and floating-point functional units, do not correlate with each other. This insight suggests the necessity of decoupling interference into multiple resource sharing dimensions. In this work, we propose SMiTe, a methodology that enables precise performance prediction for SMT co-location on real-system commodity processors. With a set of Rulers, which are carefully designed software stressors that apply pressure to a multidimensional space of shared resources, we quantify application sensitivity and contentiousness in a decoupled manner. We then establish a regression model to combine the sensitivity and contentiousness in different dimensions to predict performance interference. Using this methodology, we are able to precisely predict the performance interference in SMT co-location with an average error of 2.80% on SPEC CPU2006 and 1.79% on Cloud Suite. 
Our evaluation shows that SMiTe allows us to improve the utilization of WSCs by up to 42.57% while enforcing an application's QoS requirements.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{laurenzano2014protean, title = {Protean code: Achieving near-free online code transformations for warehouse scale computers}, author = {M Laurenzano and Yunqi Zhang and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/05/MICRO.2014.21.pdf}, year = {2014}, date = {2014-01-01}, booktitle = {Proceedings of the 47th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)}, abstract = {Rampant dynamism due to load fluctuations, co-runner changes, and varying levels of interference poses a threat to application quality of service (QoS) and has limited our ability to allow co-locations in modern warehouse scale computers (WSCs). Instruction set features such as the non-temporal memory access hints found in modern ISAs (both ARM and x86) may be useful in mitigating these effects. However, despite the challenge of this dynamism and the availability of an instruction set mechanism that might help address the problem, a key capability missing in the system software stack in modern WSCs is the ability to dynamically transform (and re-transform) the executing application code to apply these instruction set features when necessary. In this work we introduce protean code, a novel approach for enacting arbitrary compiler transformations at runtime for native programs running on commodity hardware with negligible (<1%) overhead. The fundamental insight behind the underlying mechanism of protean code is that, instead of maintaining full control throughout the program's execution as with traditional dynamic optimizers, protean code allows the original binary to execute continuously and diverts control flow only at a set of virtualized points, allowing rapid and seamless rerouting to the new code variants. 
In addition, the protean code compiler embeds IR with high-level semantic information into the program, empowering the dynamic compiler to perform rich analysis and transformations online with little overhead. Using a fully functional protean code compiler and runtime built on LLVM, we design PC3D, Protean Code for Cache Contention in Datacenters. PC3D dynamically employs non-temporal access hints to achieve utilization improvements of up to 2.8x (1.5x on average) higher than state-of-the-art contention mitigation runtime techniques at a QoS target of 98%.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @article{breslow2014enabling, title = {Enabling fair pricing on high performance computer systems with node sharing}, author = {Alex D Breslow and Ananta Tiwari and Martin Schulz and Laura Carrington and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/05/2503210.2503256.pdf}, year = {2014}, date = {2014-01-01}, journal = {Scientific Programming}, volume = {22}, number = {2}, pages = {59--74}, publisher = {IOS Press}, abstract = {Co-location, where multiple jobs share compute nodes in large-scale HPC systems, has been shown to increase aggregate throughput and energy efficiency by 10--20%. However, system operators disallow co-location due to fair-pricing concerns, i.e., a pricing mechanism that considers performance interference from co-running jobs. In the current pricing model, application execution time determines the price, which results in unfair prices paid by the minority of users whose jobs suffer from co-location.This paper presents POPPA, a runtime system that enables fair pricing by delivering precise online interference detection and facilitates the adoption of supercomputers with co-locations. 
POPPA leverages a novel shutter mechanism --a cyclic, fine-grained interference sampling mechanism to accurately deduce the interference between co-runners --to provide unbiased pricing of jobs that share nodes. POPPA is able to quantify inter-application interference within 4% mean absolute error on a variety of co-located benchmark and real scientific workloads.}, keywords = {Pub}, pubstate = {published}, tppubtype = {article} } @article{tang2013reqos, title = {{ReQoS}: Reactive static/dynamic compilation for {QoS} in warehouse scale computers}, author = {Lingjia Tang and Jason Mars and Wei Wang and Tanima Dey and Mary Lou Soffa}, url = {https://www.jasonmars.org/wp-content/uploads/2020/05/2490301.2451126.pdf}, year = {2013}, date = {2013-01-01}, journal = {ACM SIGPLAN Notices}, volume = {48}, number = {4}, pages = {89--100}, publisher = {ACM New York, NY, USA}, abstract = {As multicore processors with expanding core counts continue to dominate the server market, the overall utilization of the class of datacenters known as warehouse scale computers (WSCs) depends heavily on colocation of multiple workloads on each server to take advantage of the computational power provided by modern processors. However, many of the applications running in WSCs, such as websearch, are user-facing and have quality of service (QoS) requirements. When multiple applications are co-located on a multicore machine, contention for shared memory resources threatens application QoS as severe cross-core performance interference may occur. WSC operators are left with two options: either disregard QoS to maximize WSC utilization, or disallow the co-location of high-priority user-facing applications with other applications, resulting in low machine utilization and millions of dollars wasted. This paper presents ReQoS, a static/dynamic compilation approach that enables low-priority applications to adaptively manipulate their own contentiousness to ensure the QoS of high-priority co-runners. 
ReQoS is composed of a profile guided compilation technique that identifies and inserts markers in contentious code regions in low-priority applications, and a lightweight runtime that monitors the QoS of high-priority applications and reactively reduces the pressure low-priority applications generate to the memory subsystem when cross-core interference is detected. In this work, we show that ReQoS can accurately diagnose contention and significantly reduce performance interference to ensure application QoS. Applying ReQoS to SPEC2006 and SmashBench workloads on real multicore machines, we are able to improve machine utilization by more than 70% in many cases, and more than 50% on average, while enforcing a 90% QoS threshold. We are also able to improve the energy efficiency of modern multicore machines by 47% on average over a policy of disallowing co-locations.}, keywords = {Pub}, pubstate = {published}, tppubtype = {article} } @inproceedings{tang2013optimizing, title = {Optimizing Google's warehouse scale computers: The NUMA experience}, author = {Lingjia Tang and Jason Mars and Xiao Zhang and Robert Hagmann and Robert Hundt and Eric Tune}, url = {https://www.jasonmars.org/wp-content/uploads/2020/05/06522318.pdf}, year = {2013}, date = {2013-01-01}, booktitle = {2013 IEEE 19th International Symposium on High Performance Computer Architecture (HPCA)}, pages = {188--197}, organization = {IEEE}, abstract = {Due to the complexity and the massive scale of modern warehouse scale computers (WSCs), it is challenging to quantify the performance impact of individual microarchitectural properties and the potential optimization benefits in the production environment. As a result of these challenges, there is currently a lack of understanding of the microarchitecture-workload interaction, leaving potentially significant performance on the table. 
This paper argues for a two-phase performance analysis methodology for optimizing WSCs that combines both an in-production investigation and an experimental load-testing study. To demonstrate the effectiveness of this two-phase approach, and to illustrate the challenges, methodologies and opportunities in optimizing modern WSCs, this paper investigates the impact of non-uniform memory access (NUMA) for several Google's key web-service workloads in large-scale production WSCs. Leveraging a newly-designed metric and continuous large-scale profiling in live datacenters, our production analysis demonstrates that NUMA has a significant impact (10-20%) on two important web-services: Gmail backend and web-search frontend. Our carefully designed load-test further reveals surprising tradeoffs between optimizing for NUMA performance and reducing cache contention.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @article{yang2013bubble, title = {{Bubble-Flux}: Precise online {QoS} management for increased utilization in warehouse scale computers}, author = {Hailong Yang and Alex Breslow and Jason Mars and Lingjia Tang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/05/2508148.2485974.pdf}, year = {2013}, date = {2013-01-01}, journal = {ACM SIGARCH Computer Architecture News}, volume = {41}, number = {3}, pages = {607--618}, publisher = {ACM New York, NY, USA}, abstract = {Ensuring the quality of service (QoS) for latency-sensitive applications while allowing co-locations of multiple applications on servers is critical for improving server utilization and reducing cost in modern warehouse-scale computers (WSCs). Recent work relies on static profiling to precisely predict the QoS degradation that results from performance interference among co-running applications to increase the number of "safe" co-locations. 
However, these static profiling techniques have several critical limitations: 1) a priori knowledge of all workloads is required for profiling, 2) it is difficult for the prediction to capture or adapt to phase or load changes of applications, and 3) the prediction technique is limited to only two co-running applications. To address all of these limitations, we present Bubble-Flux, an integrated dynamic interference measurement and online QoS management mechanism to provide accurate QoS control and maximize server utilization. Bubble-Flux uses a Dynamic Bubble to probe servers in real time to measure the instantaneous pressure on the shared hardware resources and precisely predict how the QoS of a latency-sensitive job will be affected by potential co-runners. Once "safe" batch jobs are selected and mapped to a server, Bubble-Flux uses an Online Flux Engine to continuously monitor the QoS of the latency-sensitive application and control the execution of batch jobs to adapt to dynamic input, phase, and load changes to deliver satisfactory QoS. Batch applications remain in a state of flux throughout execution. Our results show that the utilization improvement achieved by Bubble-Flux is up to 2.2x better than the prior static approach.}, keywords = {Pub}, pubstate = {published}, tppubtype = {article} } @inproceedings{mars2013whare, title = {Whare-map: heterogeneity in "homogeneous" warehouse-scale computers}, author = {Jason Mars and Lingjia Tang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/05/2508148.2485975.pdf}, year = {2013}, date = {2013-01-01}, booktitle = {Proceedings of the 40th Annual International Symposium on Computer Architecture}, pages = {619--630}, abstract = {Modern "warehouse scale computers" (WSCs) continue to be embraced as homogeneous computing platforms. However, due to frequent machine replacements and upgrades, modern WSCs are in fact composed of diverse commodity microarchitectures and machine configurations. 
Yet, current WSCs are architected with the assumption of homogeneity, leaving a potentially significant performance opportunity unexplored. In this paper, we expose and quantify the performance impact of the "homogeneity assumption" for modern production WSCs using industry-strength large-scale web-service workloads. In addition, we argue for, and evaluate the benefits of, a heterogeneity-aware WSC using commercial web-service production workloads including Google's web-search. We also identify key factors impacting the available performance opportunity when exploiting heterogeneity and introduce a new metric, opportunity factor, to quantify an application's sensitivity to the heterogeneity in a given WSC. To exploit heterogeneity in "homogeneous" WSCs, we propose "Whare-Map," the WSC Heterogeneity Aware Mapper that leverages already in-place continuous profiling subsystems found in production environments. When employing "Whare-Map", we observe a cluster-wide performance improvement of 15% on average over heterogeneity--oblivious job placement and up to an 80% improvement for web-service applications that are particularly sensitive to heterogeneity.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @incollection{mars2013understanding, title = {Understanding application contentiousness and sensitivity on modern multicores}, author = {Jason Mars and Lingjia Tang}, url = {https://www.jasonmars.org/wp-content/uploads/2020/05/1-s2.0-B9780124080898000021-main.pdf}, year = {2013}, date = {2013-01-01}, booktitle = {Advances in Computers}, volume = {91}, pages = {59--85}, publisher = {Elsevier}, abstract = {Runtime systems to mitigate memory resource contention problems on multicore processors have recently attracted much research attention. One critical component of these runtimes is the indicators to rank and classify applications based on their contention characteristics. 
However, although there has been significant research effort, application contention characteristics remain not well understood and indicators have not been thoroughly evaluated. In this chapter, we performed a thorough study of applications' contention characteristics to develop better indicators to improve contention-aware runtime systems. The contention characteristics are composed of an application's contentiousness, and its sensitivity to contention. We show that contentiousness and sensitivity are not strongly correlated, and contrary to prior wisdom, a single indicator is not adequate to predict both. Also, while prior wisdom has relied on last level cache miss rate as one of the best indicators to predict an application's contention characteristics, we show that depending on the workloads, it can often be misleading. We then present prediction models that consider contention in various memory resources. Our regression analysis establishes an accurate model to predict application contentiousness. The analysis also demonstrates that performance counters alone may not be sufficient to accurately predict application sensitivity to contention. 
In this chapter, we also present an evaluation using SPEC CPU2006 benchmarks showing that, when predicting an application's contentiousness, the linear correlation coefficient R2 of our predictor and the real measured contentiousness is 0.834, as opposed to 0.224 when using last level cache miss rate.}, keywords = {Pub}, pubstate = {published}, tppubtype = {incollection} } @inproceedings{breslow2013enabling, title = {Enabling fair pricing on HPC systems with node sharing}, author = {Alex D Breslow and Ananta Tiwari and Martin Schulz and Laura Carrington and Lingjia Tang and Jason Mars}, url = {https://www.jasonmars.org/wp-content/uploads/2020/05/2503210.2503256.pdf}, year = {2013}, date = {2013-01-01}, booktitle = {Proceedings of the international conference on high performance computing, networking, storage and analysis}, pages = {1--12}, abstract = {Co-location, where multiple jobs share compute nodes in large-scale HPC systems, has been shown to increase aggregate throughput and energy efficiency by 10 to 20%. However, system operators disallow co-location due to fair-pricing concerns, i.e., a pricing mechanism that considers performance interference from co-running jobs. In the current pricing model, application execution time determines the price, which results in unfair prices paid by the minority of users whose jobs suffer from co-location. This paper presents POPPA, a runtime system that enables fair pricing by delivering precise online interference detection and facilitates the adoption of supercomputers with co-locations. POPPA leverages a novel shutter mechanism -- a cyclic, fine-grained interference sampling mechanism to accurately deduce the interference between co-runners -- to provide unbiased pricing of jobs that share nodes. 
POPPA is able to quantify inter-application interference within 4% mean absolute error on a variety of co-located benchmark and real scientific workloads.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @misc{mars2013scenario, title = {Scenario based optimization}, author = {Jason Mars and Robert Hundt}, url = {https://www.jasonmars.org/wp-content/uploads/2020/05/US8578355.pdf}, year = {2013}, date = {2013-01-01}, abstract = {Techniques and systems for scenario based optimization can include generating multiple different versions of a program segment based on different respective execution scenarios associated with an execution of a program, the program operable to use the program segment versions. In another aspect, techniques and systems can include executing a program executable associated with multiple different versions of a program segment, analyzing the execution for an indication of at least one of the execution scenarios to select one of the program segment versions based on the indication, and causing the execution to use the selected program segment version during at least a portion of the execution.}, note = {US Patent 8,578,355}, keywords = {Patent}, pubstate = {published}, tppubtype = {misc} } @inproceedings{tang2012compiling, title = {Compiling for niceness: Mitigating contention for qos in warehouse scale computers}, author = {Lingjia Tang and Jason Mars and Mary Lou Soffa}, url = {https://www.jasonmars.org/wp-content/uploads/2020/12/Compiling_for_niceness_Mitigating_contention_for_Q.pdf}, year = {2012}, date = {2012-01-01}, booktitle = {Proceedings of the Tenth International Symposium on Code Generation and Optimization}, pages = {1--12}, abstract = {As the class of datacenters recently coined as warehouse scale computers (WSCs) continues to leverage commodity multicore processors with increasing core counts, there is a growing need to consolidate various workloads on these machines to fully utilize their computation 
power. However, it is well known that when multiple applications are co-located on a multicore machine, contention for shared memory resources can cause severe cross-core performance interference. To ensure that the quality of service (QoS) of user-facing applications does not suffer from performance interference, WSC operators resort to disallowing co-location of latency-sensitive applications with other applications. This policy translates to low machine utilization and millions of dollars wasted in WSCs. This paper presents QoS-Compile, the first compilation approach that statically manipulates application contentiousness to enable the co-location of applications with varying QoS requirements, and as a result, can greatly improve machine utilization. Our technique first pinpoints an application's code regions that tend to cause contention and performance interference. QoS-Compile then transforms those regions to reduce their contentious nature. In essence, to co-locate applications of different QoS priorities, our compilation technique uses pessimizing transformations to throttle down the memory access rate of the contentious regions in low priority applications to reduce their interference to high priority applications. Our evaluation using synthetic benchmarks, SPEC benchmarks and large-scale Google applications show that QoS-Compile can greatly reduce contention, improve QoS of applications, and improve machine utilization. 
Our experiments show that our technique improves applications' QoS performance by 21% and machine utilization by 36% on average.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @article{mars2012increasing, title = {Increasing utilization in modern warehouse-scale computers using bubble-up}, author = {Jason Mars and Lingjia Tang and Kevin Skadron and Mary Lou Soffa and Robert Hundt}, url = {https://www.jasonmars.org/wp-content/uploads/2020/12/Increasing_Utilization_in_Modern_Warehouse-Scale_C.pdf}, year = {2012}, date = {2012-01-01}, journal = {IEEE Micro}, volume = {32}, number = {3}, pages = {88--99}, publisher = {IEEE}, abstract = {Precisely predicting performance degradation due to co-locating multiple executing applications on a single machine is critical for improving utilization in modern warehouse-scale computers (WSCs). Bubble-Up is the first mechanism for such precise prediction. As opposed to over-provisioning machines, Bubble-Up enables the safe colocation of multiple workloads on a single machine for Web service applications that have quality of service constraints, thus greatly improving machine utilization in modern WSCs.}, keywords = {Pub}, pubstate = {published}, tppubtype = {article} } @inproceedings{wang2012performance, title = {Performance analysis of thread mappings with a holistic view of the hardware resources}, author = {Wei Wang and Tanima Dey and Jason Mars and Lingjia Tang and Jack W Davidson and Mary Lou Soffa}, url = {https://www.jasonmars.org/wp-content/uploads/2020/12/10.1.1.384.6879.pdf}, year = {2012}, date = {2012-01-01}, booktitle = {2012 IEEE International Symposium on Performance Analysis of Systems & Software}, pages = {156--167}, organization = {IEEE}, abstract = {With the shift to chip multiprocessors, managing shared resources has become a critical issue in realizing their full potential. Previous research has shown that thread mapping is a powerful tool for resource management. 
However, the difficulty of simultaneously managing multiple hardware resources and the varying nature of the workloads have impeded the efficiency of thread mapping algorithms. To overcome the difficulties of simultaneously managing multiple resources with thread mapping, the interaction between various microarchitectural resources and thread characteristics must be well understood. This paper presents an in-depth analysis of PARSEC benchmarks running under different thread mappings to investigate the interaction of various thread mappings with microarchitectural resources including, L1 I/D-caches, I/D TLBs, L2 caches, hardware prefetchers, off-chip memory interconnects, branch predictors, memory disambiguation units and the cores. For each resource, the analysis provides guidelines for how to improve its utilization when mapping threads with different characteristics. We also analyze how the relative importance of the resources varies depending on the workloads. Our experiments show that when only memory resources are considered, thread mapping improves an application's performance by as much as 14% over the default Linux scheduler. In contrast, when both memory and processor resources are considered the mapping algorithm achieves performance improvements by as much as 28%. 
Additionally, we demonstrate that thread mapping should consider L2 caches, prefetchers and off-chip memory interconnects as one resource, and we present a new metric called L2-misses-memory-latency-product (L2MP) for evaluating their aggregated performance impact.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{walcott2012theme, title = {THeME: a system for testing by hardware monitoring events}, author = {Kristen Walcott-Justice and Jason Mars and Mary Lou Soffa}, url = {https://www.jasonmars.org/wp-content/uploads/2020/12/THeME_a_system_for_testing_by_hardware_m.pdf}, year = {2012}, date = {2012-01-01}, booktitle = {Proceedings of the 2012 International Symposium on Software Testing and Analysis}, pages = {12--22}, abstract = {The overhead of test coverage analysis is dominated by monitoring the application, which is traditionally performed using instrumentation. However, instrumentation can prohibitively increase the time and especially the memory overhead of an application. As an alternative to instrumentation, we explore how recent hardware advances can be leveraged to improve the overheads of test coverage analysis. These hardware advances include hardware performance monitors and multicore technology. In this work, we present our system, THeME, a testing framework that replaces instrumentation with hardware monitoring. THeME consists of a runtime system that takes advantage of hardware mechanisms and multiple cores and a static component to further extend the coverage derived from hardware event sampling. 
The results show that up to 90% of the actual coverage can be determined with less time overhead and negligible code growth compared to instrumentation.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{mars2012blockchop, title = {BlockChop: dynamic squash elimination for hybrid processor architecture}, author = {Jason Mars and Naveen Kumar}, url = {https://www.jasonmars.org/wp-content/uploads/2020/12/mars12isca.pdf}, year = {2012}, date = {2012-01-01}, booktitle = {2012 39th Annual International Symposium on Computer Architecture (ISCA)}, pages = {536--547}, organization = {IEEE}, abstract = {Hybrid processors are HW/SW co-designed processors that leverage blocked-execution, the execution of regions of instructions as atomic blocks, to facilitate aggressive speculative optimization. As we move to a multicore hybrid design, fine grained conflicts for shared data can violate the atomicity requirement of these blocks and lead to expensive squashes and rollbacks. However, as these atomic regions differ from those used in checkpointing and transactional memory systems, the extent of this potentially prohibitive problem remains unclear, and mechanisms to mitigate these squashes dynamically may be critical to enable a highly performant multicore hybrid design. In this work, we investigate how multithreaded applications, both benchmark and commercial workloads, are affected by squashes, and present dynamic mechanisms for mitigating these squashes in hybrid processors. While the current wisdom is that there is not a significant number of squashes for smaller atomic regions, we observe this is not the case for many multithreaded workloads. With region sizes of just 200 - 500 instructions, we observe a performance degradation ranging from 10% to more than 50% for workloads with a mixture of shared reads and writes. 
By harnessing the unique flexibility provided by the software subsystem of hybrid processor design, we present BlockChop, a framework for dynamically mitigating squashes on multicore hybrid processors. We present a range of squash handling mechanisms leveraging retrials, interpretation, and retranslation, and find that BlockChop is quite effective. Over the current response to exceptions and squashes in a hybrid design, we are able to improve the performance of benchmark and commercial workloads by 1.4x and 1.2x on average for large and small region sizes respectively.}, keywords = {Pub}, pubstate = {published}, tppubtype = {inproceedings} }

@phdthesis{mars2012rethinking,
  title     = {Rethinking the architecture of warehouse-scale computers},
  author    = {Mars, Jason},
  url       = {https://www.jasonmars.org/wp-content/uploads/2020/12/10.1.1.298.5501.pdf},
  year      = {2012},
  date      = {2012-01-01},
  school    = {University of Virginia},
  abstract  = {As the world’s computation continues to move into the massive datacenter infrastructures recently coined as “warehouse-scale computers” (WSCs), developing highly efficient systems for these computing platforms has become increasingly critical. The architecture of modern WSCs remains in its relative infancy. In designing modern WSCs, architects start with commodity off-the-shelf components including commodity processors and open source system software components. These components are then stitched together to design a simple and cost effective WSC. While this approach has been effective for producing systems that are functional and can scale the delivery of web-services as demand increases, efficiency has suffered. The commodity components and system software used have not been designed and refined with the unique characteristics of WSCs in mind, and these characteristics may be critical for a highly efficient WSC design. As such, we must rethink the architecture of modern WSCs. This dissertation argues that one such characteristic has been overlooked: the diversity in execution environments in modern WSCs. We define a given task’s execution environment as the coupling of the machine configuration and the co-running tasks simultaneously executing alongside the given task. At any given time in a WSC, we have a high degree of diversity across these execution environments. This dissertation argues that acknowledging, exploiting, and adapting to the diversity in execution environments are critical for the design of a highly efficient WSC. When ignoring this diversity, three critical design problems arise, including 1) the homogeneous assumption, where all machines and cores in a WSC are assumed to be equal and managed accordingly, 2) the rigidness of applications, where application binaries can not adapt to changes across and within execution environments, and 3) the oblivion of interference, where interference between tasks within an execution environment can not be measured or managed. This dissertation addresses each of these three design problems. First, we address the homogeneous assumption at the cluster level by redesigning the task manager in the WSC to learn which execution environments tasks prefer and map them accordingly. Second, we address the rigidness of applications at the machine level by providing a mechanism to allow applications to adapt to their execution environment, then leverage this mechanism to solve pressing problems in WSCs. Lastly, we address the oblivion of interference at both the cluster and machine levels by providing novel metrics and techniques for measuring and managing interference to improve the utilization of WSCs. By incorporating an awareness of the diversity in execution environments in these three key design areas, we produce a WSC design that is significantly more efficient in both the performance of the applications that live in this domain and the utilization of compute resources in the WSC. By improving efficiency for these two metrics, we effectively require a smaller WSC for some fixed workload, which has implications on reducing not only the cost of these systems, but also their environmental footprint.},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {phdthesis}
}

@inproceedings{tang2011impact,
  title     = {The impact of memory subsystem resource sharing on datacenter applications},
  author    = {Tang, Lingjia and Mars, Jason and Vachharajani, Neil and Hundt, Robert and Soffa, Mary Lou},
  year      = {2011},
  date      = {2011-01-01},
  booktitle = {2011 38th Annual International Symposium on Computer Architecture (ISCA)},
  pages     = {283--294},
  organization = {IEEE},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

@inproceedings{mars2011bubble,
  title     = {Bubble-up: Increasing utilization in modern warehouse scale computers via sensible co-locations},
  author    = {Mars, Jason and Tang, Lingjia and Hundt, Robert and Skadron, Kevin and Soffa, Mary Lou},
  year      = {2011},
  date      = {2011-01-01},
  booktitle = {Proceedings of the 44th annual IEEE/ACM International Symposium on Microarchitecture},
  pages     = {248--259},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

@inproceedings{mars2011directly,
  title     = {Directly characterizing cross core interference through contention synthesis},
  author    = {Mars, Jason and Tang, Lingjia and Soffa, Mary Lou},
  year      = {2011},
  date      = {2011-01-01},
  booktitle = {Proceedings of the 6th International Conference on High Performance and Embedded Architectures and Compilers},
  pages     = {167--176},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

@inproceedings{tang2011contentiousness,
  title     = {Contentiousness vs. sensitivity: improving contention aware runtime systems on multicore architectures},
  author    = {Tang, Lingjia and Mars, Jason and Soffa, Mary Lou},
  year      = {2011},
  date      = {2011-01-01},
  booktitle = {Proceedings of the 1st International Workshop on Adaptive Self-Tuning Computing Systems for the Exaflop Era},
  pages     = {12--21},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

@article{mars2011heterogeneity,
  title     = {Heterogeneity in “homogeneous” warehouse-scale computers: A performance opportunity},
  author    = {Mars, Jason and Tang, Lingjia and Hundt, Robert},
  year      = {2011},
  date      = {2011-01-01},
  journal   = {IEEE Computer Architecture Letters},
  volume    = {10},
  number    = {2},
  pages     = {29--32},
  publisher = {IEEE},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {article}
}

@inproceedings{soffa2011exploiting,
  title     = {Exploiting hardware advances for software testing and debugging (nier track)},
  author    = {Soffa, Mary Lou and Walcott, Kristen R. and Mars, Jason},
  year      = {2011},
  date      = {2011-01-01},
  booktitle = {Proceedings of the 33rd International Conference on Software Engineering},
  pages     = {888--891},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

@article{hiser2011evaluating,
  title     = {Evaluating indirect branch handling mechanisms in software dynamic translation systems},
  author    = {Hiser, Jason D. and Williams, Daniel W. and Hu, Wei and Davidson, Jack W. and Mars, Jason and Childers, Bruce R.},
  year      = {2011},
  date      = {2011-01-01},
  journal   = {ACM Transactions on Architecture and Code Optimization (TACO)},
  volume    = {8},
  number    = {2},
  pages     = {1--28},
  publisher = {ACM New York, NY, USA},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {article}
}

@inproceedings{mars2011loaf,
  title     = {Loaf: a framework and infrastructure for creating online adaptive solutions},
  author    = {Mars, Jason and Soffa, Mary Lou},
  year      = {2011},
  date      = {2011-01-01},
  booktitle = {Proceedings of the 1st International Workshop on Adaptive Self-Tuning Computing Systems for the Exaflop Era},
  pages     = {52--63},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

@article{becker20112011,
  title     = {2011 Index IEEE Computer Architecture Letters Vol. 10},
  author    = {Becker, Daniel and Choi, Inseok and Cooper-Balis, Elliott and Dally, William and Devadas, Srinivas and Duato, Jose and Flich, Jose and Fu, Chen and Gaudiot, Jean-Luc and Gaydadjiev, Georgi and others},
  year      = {2011},
  date      = {2011-01-01},
  journal   = {Computer},
  volume    = {53},
  pages     = {56},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {article}
}

@article{mars2011computer,
  title     = {COMPUTER ARCHITECTURE LETTERS},
  author    = {Mars, J. and Tang, L. and Hundt, R. and Michelogiannakis, G. and Jiang, N. and Becker, D. U. and Dally, W. J. and Ho, C. H. and Staus, G. and Ulmer, A. and others},
  year      = {2011},
  date      = {2011-01-01},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {article}
}

@inproceedings{mars2010contention,
  title     = {Contention aware execution: online contention detection and response},
  author    = {Mars, Jason and Vachharajani, Neil and Hundt, Robert and Soffa, Mary Lou},
  year      = {2010},
  date      = {2010-01-01},
  booktitle = {Proceedings of the 8th annual IEEE/ACM international symposium on Code generation and optimization},
  pages     = {257--265},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

@inproceedings{mars2009scenario,
  title     = {Scenario based optimization: A framework for statically enabling online optimizations},
  author    = {Mars, Jason and Hundt, Robert},
  year      = {2009},
  date      = {2009-01-01},
  booktitle = {2009 International Symposium on Code Generation and Optimization},
  pages     = {169--179},
  organization = {IEEE},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

@inproceedings{mars2009synthesizing,
  title     = {Synthesizing contention},
  author    = {Mars, Jason and Soffa, Mary Lou},
  year      = {2009},
  date      = {2009-01-01},
  booktitle = {Proceedings of the Workshop on Binary Instrumentation and Applications},
  pages     = {17--25},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

@inproceedings{williams2009cross,
  title     = {A cross-layer approach to heterogeneity and reliability},
  author    = {Williams, Daniel and Sanyal, Aprotim and Upton, Dan and Mars, Jason and Ghosh, Sudeep and Hazelwood, Kim},
  year      = {2009},
  date      = {2009-01-01},
  booktitle = {2009 7th IEEE/ACM International Conference on Formal Methods and Models for Co-Design},
  pages     = {88--97},
  organization = {IEEE},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

@article{mars2009analysis,
  title     = {An Analysis of Mac OS X Leopard},
  author    = {Mars, Jason and Dickerson, Robert},
  year      = {2009},
  date      = {2009-01-01},
  url       = {http://www.cs.virginia.edu/~jom5x/papers/macos.pdf},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {article}
}

@phdthesis{mars2009online,
  title     = {Online Adaptation for Application Performance and Efficiency},
  author    = {Mars, Jason},
  year      = {2009},
  date      = {2009-01-01},
  school    = {University of Virginia},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {phdthesis}
}

@inproceedings{mars2008mats,
  title     = {Mats: Multicore adaptive trace selection},
  author    = {Mars, Jason and Soffa, Mary Lou},
  year      = {2008},
  date      = {2008-01-01},
  booktitle = {Workshop on Software Tools for MultiCore Systems (STMCS)},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

@inproceedings{mars2008reactive,
  title     = {A reactive unobtrusive prefetcher for multicore and manycore architectures},
  author    = {Mars, Jason and Williams, Daniel and Upton, Dan and Ghosh, Sudeep and Hazelwood, Kim},
  year      = {2008},
  date      = {2008-01-01},
  booktitle = {Proceedings of the Workshop on Software and Hardware Challenges of Manycore Platforms (SHCMP)},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

@inproceedings{hiser2007evaluating,
  title     = {Evaluating indirect branch handling mechanisms in software dynamic translation systems},
  author    = {Hiser, Jason D. and Williams, Daniel and Hu, Wei and Davidson, Jack W. and Mars, Jason and Childers, Bruce R.},
  year      = {2007},
  date      = {2007-01-01},
  booktitle = {International Symposium on Code Generation and Optimization (CGO'07)},
  pages     = {61--73},
  organization = {IEEE},
  keywords  = {Pub},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

@misc{mars2020systemsb,
  title     = {Systems and methods for intelligently configuring and deploying a machine learning-based dialogue system},
  author    = {Mars, Jason and Tang, Lingjia and Laurenzano, Michael A. and Hauswald, Johann and Hill, Parker and Kang, Yiping and Zhang, Yunqi},
  note      = {US Patent 10,740,371},
  keywords  = {Patent},
  pubstate  = {published},
  tppubtype = {misc}
}