2023
Christopher Clarke; Yuzhao Heng; Yiping Kang; Krisztian Flautner; Lingjia Tang; Jason Mars
Label Agnostic Pre-training for Zero-shot Text Classification Proceedings Article
In: Findings of the Association for Computational Linguistics: ACL 2023, pp. 1009–1021, Association for Computational Linguistics, Toronto, Canada, 2023.
@inproceedings{clarke-etal-2023-label,
title = {Label Agnostic Pre-training for Zero-shot Text Classification},
author = {Christopher Clarke and Yuzhao Heng and Yiping Kang and Krisztian Flautner and Lingjia Tang and Jason Mars},
url = {https://aclanthology.org/2023.findings-acl.64},
doi = {10.18653/v1/2023.findings-acl.64},
year = {2023},
date = {2023-07-01},
booktitle = {Findings of the Association for Computational Linguistics: ACL 2023},
pages = {1009--1021},
publisher = {Association for Computational Linguistics},
address = {Toronto, Canada},
abstract = {Conventional approaches to text classification typically assume the existence of a fixed set of predefined labels to which a given text can be classified. However, in real-world applications, there exists an infinite label space for describing a given text. In addition, depending on the aspect (sentiment, topic, etc.) and domain of the text (finance, legal, etc.), the interpretation of the label can vary greatly. This makes the task of text classification, particularly in the zero-shot scenario, extremely challenging. In this paper, we investigate the task of zero-shot text classification with the aim of improving the ability of pre-trained language models (PLMs) to generalize to both seen and unseen data across varying aspects and domains. To solve this we introduce two new simple yet effective pre-training strategies, Implicit and Explicit pre-training. These methods inject aspect-level understanding into the model at train time with the goal of conditioning the model to build task-level understanding. To evaluate this, we construct and release UTCD, a new benchmark dataset for evaluating text classification in zero-shot settings. Experimental results on UTCD show that our approach achieves improved zero-shot generalization on a suite of challenging datasets across an array of zero-shot formalizations.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
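The zero-shot formalizations studied in this paper (binary, dual-encoder, and generative) can be illustrated with the generic entailment-based approach. The sketch below uses the Hugging Face zero-shot pipeline as a stand-in, not the paper's UTCD-trained models; the model name and aspect-style hypothesis template are assumptions.

# Hedged sketch: entailment-style zero-shot classification, a generic
# stand-in for the formalizations studied in this paper (not the
# authors' released models). Requires the `transformers` package.
from transformers import pipeline

classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")  # assumed stand-in model

text = "The quarterly earnings beat analyst expectations."
labels = ["finance", "legal", "sports"]

# Conditioning the hypothesis on an aspect (here, topic) mirrors the
# paper's aspect-level framing; the template wording is an assumption.
result = classifier(text, candidate_labels=labels,
                    hypothesis_template="The topic of this text is {}.")
print(result["labels"][0], result["scores"][0])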
Christopher Clarke; Matthew Hall; Gaurav Mittal; Ye Yu; Sandra Sajeev; Jason Mars; Mei Chen
Rule By Example: Harnessing Logical Rules for Explainable Hate Speech Detection Proceedings Article
In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 364–376, Association for Computational Linguistics, Toronto, Canada, 2023.
@inproceedings{clarke-etal-2023-rule,
title = {Rule By Example: Harnessing Logical Rules for Explainable Hate Speech Detection},
author = {Christopher Clarke and Matthew Hall and Gaurav Mittal and Ye Yu and Sandra Sajeev and Jason Mars and Mei Chen},
url = {https://aclanthology.org/2023.acl-long.22},
doi = {10.18653/v1/2023.acl-long.22},
year = {2023},
date = {2023-07-01},
booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
pages = {364--376},
publisher = {Association for Computational Linguistics},
address = {Toronto, Canada},
abstract = {Classic approaches to content moderation typically apply a rule-based heuristic approach to flag content. While rules are easily customizable and intuitive for humans to interpret, they are inherently fragile and lack the flexibility or robustness needed to moderate the vast amount of undesirable content found online today. Recent advances in deep learning have demonstrated the promise of using highly effective deep neural models to overcome these challenges. However, despite the improved performance, these data-driven models lack transparency and explainability, often leading to mistrust from everyday users and a lack of adoption by many platforms. In this paper, we present Rule By Example (RBE): a novel exemplar-based contrastive learning approach for learning from logical rules for the task of textual content moderation. RBE is capable of providing rule-grounded predictions, allowing for more explainable and customizable predictions compared to typical deep learning-based approaches. We demonstrate that our approach is capable of learning rich rule embedding representations using only a few data examples. Experimental results on 3 popular hate speech classification datasets show that RBE is able to outperform state-of-the-art deep learning classifiers as well as the use of rules in both supervised and unsupervised settings while providing explainable model predictions via rule-grounding.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
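At inference time, RBE-style rule grounding reduces to a nearest-rule lookup in embedding space: classify by similarity to the closest rule embedding and report that rule as the explanation. The sketch below is a toy illustration under stated assumptions; the hashing encoder stands in for the paper's learned rule and text encoders, and the rule texts and threshold are hypothetical.

import numpy as np

def embed(text, dim=64):
    # Toy hashed bag-of-words encoder; a stand-in for RBE's learned
    # text/rule encoders, used only to make the lookup runnable.
    v = np.zeros(dim)
    for tok in text.lower().split():
        v[hash(tok) % dim] += 1.0
    return v / (np.linalg.norm(v) + 1e-8)

# Hypothetical ruleset: each logical rule is verbalized so it can be embedded.
rules = {
    "slur_rule": "message contains a slur targeting a protected group",
    "threat_rule": "message threatens violence against a person or group",
}

def rule_grounded_predict(text, rules, tau=0.5):
    # The matched rule name doubles as the explanation for the prediction.
    t = embed(text)
    sims = {name: float(t @ embed(r)) for name, r in rules.items()}
    best = max(sims, key=sims.get)
    label = "flagged" if sims[best] >= tau else "benign"
    return label, best, sims[best]

print(rule_grounded_predict("this message threatens violence", rules))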
Jason Mars; Yiping Kang; Roland Daynauth; Baichuan Li; Ashish Mahendra; Krisztian Flautner; Lingjia Tang
The Jaseci Programming Paradigm and Runtime Stack: Building Scale-Out Production Applications Easy and Fast Journal Article
In: IEEE Computer Architecture Letters, vol. 22, no. 2, pp. 101–104, 2023.
@article{10129141,
title = {The Jaseci Programming Paradigm and Runtime Stack: Building Scale-Out Production Applications Easy and Fast},
author = {Jason Mars and Yiping Kang and Roland Daynauth and Baichuan Li and Ashish Mahendra and Krisztian Flautner and Lingjia Tang},
doi = {10.1109/LCA.2023.3274038},
year = {2023},
date = {2023-01-01},
urldate = {2023-01-01},
journal = {IEEE Computer Architecture Letters},
volume = {22},
number = {2},
pages = {101--104},
abstract = {Today's production scale-out applications include many sub-application components, such as storage backends, logging infrastructure and AI models. These components have drastically different characteristics, are required to work in collaboration, and interface with each other as microservices. This leads to increasingly high complexity in developing, optimizing, configuring, and deploying scale-out applications, raising the barrier to entry for most individuals and small teams. We developed a novel co-designed runtime system, Jaseci, and programming language, Jac, which aim to reduce this complexity. The key principle throughout Jaseci's design is to raise the level of abstraction by moving as much of the scale-out data management, microservice componentization, and live update complexity as possible into the runtime stack, where it is automated and optimized. We use real-world AI applications to demonstrate Jaseci's benefit for application performance and developer productivity.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
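Jac programs express computation as walkers traversing a graph of nodes and edges, with the Jaseci runtime handling persistence and microservice placement behind that abstraction. The Python sketch below is a conceptual analogy of that data-spatial model only; it is not Jac syntax, and all names are hypothetical.

# Conceptual Python analogy of Jaseci's walker-over-graph abstraction
# (not Jac syntax; in Jaseci the runtime would also manage persistence
# and scale-out placement behind this interface).
class Node:
    def __init__(self, name, ability=None):
        self.name, self.ability, self.edges = name, ability, []

    def connect(self, other):
        self.edges.append(other)
        return other

class Walker:
    def visit(self, start):
        # Depth-first traversal, running each node's ability on arrival.
        stack, seen = [start], set()
        while stack:
            node = stack.pop()
            if node.name in seen:
                continue
            seen.add(node.name)
            if node.ability:
                node.ability(node)
            stack.extend(node.edges)

root = Node("user", ability=lambda n: print(f"at {n.name}"))
root.connect(Node("profile", ability=lambda n: print(f"at {n.name}")))
Walker().visit(root)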
2022
Christopher Clarke; Joseph Peper; Karthik Krishnamurthy; Walter Talamonti; Kevin Leach; Walter Lasecki; Yiping Kang; Lingjia Tang; Jason Mars
One Agent To Rule Them All: Towards Multi-agent Conversational AI Proceedings Article
In: Findings of the Association for Computational Linguistics: ACL 2022, pp. 3258–3267, Association for Computational Linguistics, Dublin, Ireland, 2022.
@inproceedings{clarke-etal-2022-one,
title = {One Agent To Rule Them All: Towards Multi-agent Conversational AI},
author = {Christopher Clarke and Joseph Peper and Karthik Krishnamurthy and Walter Talamonti and Kevin Leach and Walter Lasecki and Yiping Kang and Lingjia Tang and Jason Mars},
url = {https://aclanthology.org/2022.findings-acl.257},
doi = {10.18653/v1/2022.findings-acl.257},
year = {2022},
date = {2022-05-01},
urldate = {2022-05-01},
booktitle = {Findings of the Association for Computational Linguistics: ACL 2022},
pages = {3258--3267},
publisher = {Association for Computational Linguistics},
address = {Dublin, Ireland},
abstract = {The increasing volume of commercially available conversational agents (CAs) on the market has resulted in users being burdened with learning and adopting multiple agents to accomplish their tasks. Though prior work has explored supporting a multitude of domains within the design of a single agent, the interaction experience suffers due to the large action space of desired capabilities. To address these problems, we introduce a new task BBAI: Black-Box Agent Integration, focusing on combining the capabilities of multiple black-box CAs at scale. We explore two techniques: question agent pairing and question response pairing aimed at resolving this task. Leveraging these techniques, we design One For All (OFA), a scalable system that provides a unified interface to interact with multiple CAs. Additionally, we introduce MARS: Multi-Agent Response Selection, a new encoder model for question response pairing that jointly encodes user question and agent response pairs. We demonstrate that OFA is able to automatically and accurately integrate an ensemble of commercially available CAs spanning disparate domains. Specifically, using the MARS encoder we achieve the highest accuracy on our BBAI task, outperforming strong baselines.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
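Question-response pairing scores each agent's candidate response jointly with the user question and routes to the top-scoring agent. The sketch below substitutes an off-the-shelf cross-encoder from sentence-transformers for the MARS encoder; the model name and the agents' responses are assumptions for illustration.

# Hedged sketch of question-response pairing with a generic cross-encoder
# standing in for the MARS encoder. Requires `sentence-transformers`.
from sentence_transformers import CrossEncoder

model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # assumed stand-in

question = "Turn up the thermostat to 72 degrees"
# Hypothetical responses returned by black-box agents probed with the question.
agent_responses = {
    "home_agent": "Setting the thermostat to 72 degrees.",
    "music_agent": "Sorry, I can only play music.",
}

# Jointly score each (question, response) pair; route to the best agent.
scores = model.predict([(question, r) for r in agent_responses.values()])
best_agent = max(zip(agent_responses, scores), key=lambda p: p[1])[0]
print(best_agent)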
2020
Tianyi Liu; Sen He; Sunzhou Huang; Danny Tsang; Lingjia Tang; Jason Mars; Wei Wang
A Benchmarking Framework for Interactive 3D Applications in the Cloud Proceedings Article
In: 2020 53rd Annual IEEE/ACM International Symposium on Microarchitecture (MICRO), pp. 881–894, IEEE 2020.
@inproceedings{liu2020benchmarking,
title = {A Benchmarking Framework for Interactive 3D Applications in the Cloud},
author = {Tianyi Liu and Sen He and Sunzhou Huang and Danny Tsang and Lingjia Tang and Jason Mars and Wei Wang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/12/2006.13378.pdf},
year = {2020},
date = {2020-01-01},
urldate = {2020-01-01},
booktitle = {2020 53rd Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
pages = {881--894},
organization = {IEEE},
abstract = {With the growing popularity of cloud gaming and cloud virtual reality (VR), interactive 3D applications have become a major class of workloads for the cloud. However, despite their growing importance, there is limited public research on how to design cloud systems to efficiently support these applications due to the lack of an open and reliable research infrastructure, including benchmarks and performance analysis tools. The challenges of generating human-like inputs under various system/application nondeterminism and dissecting the performance of complex graphics systems make it very difficult to design such an infrastructure. In this paper, we present the design of a novel research infrastructure, Pictor, for cloud 3D applications and systems. Pictor employs AI to mimic human interactions with complex 3D applications. It can also track the processing of user inputs to provide in-depth performance measurements for the complex software and hardware stack used for cloud 3D-graphics rendering. With Pictor, we designed a benchmark suite with six interactive 3D applications. Performance analyses were conducted with these benchmarks, which show that cloud system designs, including both system software and hardware designs, are crucial to the performance of cloud 3D applications. The analyses also show that energy consumption can be reduced by at least 37% when two 3D applications share a cloud server. To demonstrate the effectiveness of Pictor, we also implemented two optimizations to address two performance bottlenecks discovered in a state-of-the-art cloud 3D-graphics rendering system. These two optimizations improved the frame rate by 57.7% on average.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2019
Ram Srivatsa Kannan; Lavanya Subramanian; Ashwin Raju; Jeongseob Ahn; Jason Mars; Lingjia Tang
Grandslam: Guaranteeing slas for jobs in microservices execution frameworks Proceedings Article
In: Proceedings of the Fourteenth EuroSys Conference 2019, pp. 1–16, 2019.
@inproceedings{kannan2019grandslam,
title = {Grandslam: Guaranteeing slas for jobs in microservices execution frameworks},
author = {Ram Srivatsa Kannan and Lavanya Subramanian and Ashwin Raju and Jeongseob Ahn and Jason Mars and Lingjia Tang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3302424.3303958.pdf},
year = {2019},
date = {2019-01-01},
booktitle = {Proceedings of the Fourteenth EuroSys Conference 2019},
pages = {1--16},
abstract = {The microservice architecture has dramatically reduced user effort in adopting and maintaining servers by providing a catalog of functions as services that can be used as building blocks to construct applications. This has enabled datacenter operators to manage datacenters hosting microservices quite differently from traditional infrastructures. Such a paradigm shift calls for rethinking the resource management strategies employed in such execution environments. We observe that the visibility enabled by a microservices execution framework can be exploited to achieve high throughput and resource utilization while still meeting Service Level Agreements, especially in multi-tenant execution scenarios.
In this study, we present GrandSLAm, a microservice execution framework that improves utilization of datacenters hosting microservices. GrandSLAm estimates the completion time of requests propagating through individual microservice stages within an application. It then leverages this estimate to drive a runtime system that dynamically batches and reorders requests at each microservice in a manner where individual jobs meet their respective target latency while achieving high throughput. GrandSLAm significantly increases throughput by up to 3x compared to our baseline, without violating SLAs for a wide range of real-world AI and ML applications.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
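The core scheduling idea is to compute, per request, how much slack remains before its end-to-end deadline given the estimated latency of its remaining microservice stages, then reorder and batch accordingly. The sketch below is a simplified single-stage version under assumed latency models; all names and numbers are hypothetical.

from dataclasses import dataclass

@dataclass
class Request:
    deadline: float        # absolute SLA deadline
    remaining_stages: int  # microservice stages left, including this one

def slack(req, now, per_stage_est):
    # Time to spare if the remaining stages take their estimated latency.
    return req.deadline - now - req.remaining_stages * per_stage_est

def form_batch(queue, now, per_stage_est, batch_latency):
    # Serve smallest-slack requests first, growing the batch only while
    # every member would still finish its remaining stages in time.
    ordered = sorted(queue, key=lambda r: slack(r, now, per_stage_est))
    batch = []
    for req in ordered:
        trial = batch + [req]
        finish = now + batch_latency(len(trial))
        if all(finish + (r.remaining_stages - 1) * per_stage_est <= r.deadline
               for r in trial):
            batch = trial
        else:
            break
    return batch

# Toy usage: batch latency grows with batch size; each stage is estimated at 5 ms.
q = [Request(deadline=30.0, remaining_stages=2),
     Request(deadline=12.0, remaining_stages=1)]
print(form_batch(q, now=0.0, per_stage_est=5.0, batch_latency=lambda n: 4.0 + 2.0 * n))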
Manish Arora; Matt Skach; Wei Huang; Xudong An; Jason Mars; Lingjia Tang; Dean M Tullsen
Understanding the Impact of Socket Density in Density Optimized Servers Proceedings Article
In: 2019 IEEE International Symposium on High Performance Computer Architecture (HPCA), pp. 687–700, IEEE 2019.
@inproceedings{arora2019understanding,
title = {Understanding the Impact of Socket Density in Density Optimized Servers},
author = {Manish Arora and Matt Skach and Wei Huang and Xudong An and Jason Mars and Lingjia Tang and Dean M Tullsen},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/08675196.pdf},
year = {2019},
date = {2019-01-01},
booktitle = {2019 IEEE International Symposium on High Performance Computer Architecture (HPCA)},
pages = {687--700},
organization = {IEEE},
abstract = {The increasing demand for computational power has led to the creation and deployment of large-scale data centers. During the last few years, data centers have seen improvements aimed at increasing computational density - the amount of throughput that can be achieved within the allocated physical footprint. This need to pack more compute in the same physical space has led to density optimized server designs. Density optimized servers push compute density significantly beyond what can be achieved by blade servers by using innovative modular chassis based designs. This paper presents a comprehensive analysis of the impact of socket density on intra-server thermals and demonstrates that increased socket density inside the server leads to large temperature variations among sockets due to inter-socket thermal coupling. The paper shows that traditional chip-level and data center-level temperature-aware scheduling techniques do not work well for thermally-coupled sockets. The paper proposes new scheduling techniques that account for the thermals of the socket a task is scheduled on, as well as thermally coupled nearby sockets. The proposed mechanisms provide 2.5% to 6.5% performance improvements across various workloads and as much as 17% over traditional temperature-aware schedulers for computation-heavy workloads.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
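A scheduler that accounts for inter-socket thermal coupling can be approximated with a linear model: each socket's temperature is ambient plus a weighted sum of all sockets' power draws, with off-diagonal weights capturing coupling. The sketch below is an illustrative approximation, not the paper's scheduler; the coupling-matrix values are assumptions.

import numpy as np

def place_task(power, coupling, task_power, t_ambient=25.0):
    # Try the new task on each socket and keep the placement that
    # minimizes the hottest predicted socket temperature.
    best, best_peak = None, float("inf")
    for s in range(len(power)):
        trial = power.copy()
        trial[s] += task_power
        temps = t_ambient + coupling @ trial  # linear thermal model (assumed)
        if temps.max() < best_peak:
            best, best_peak = s, float(temps.max())
    return best, best_peak

# Two thermally coupled sockets: off-diagonal terms model heat spillover.
coupling = np.array([[0.30, 0.12],
                     [0.12, 0.30]])
power = np.array([80.0, 40.0])
print(place_task(power, coupling, task_power=30.0))  # prefers the cooler socket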
Stefan Larson; Anish Mahendran; Andrew Lee; Jonathan K Kummerfeld; Parker Hill; Michael A Laurenzano; Johann Hauswald; Lingjia Tang; Jason Mars
Outlier Detection for Improved Data Quality and Diversity in Dialog Systems Proceedings Article
In: Proceedings of NAACL-HLT 2019, pp. 517–527, 2019.
@inproceedings{larson2019outlier,
title = {Outlier Detection for Improved Data Quality and Diversity in Dialog Systems},
author = {Stefan Larson and Anish Mahendran and Andrew Lee and Jonathan K Kummerfeld and Parker Hill and Michael A Laurenzano and Johann Hauswald and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/N19-1051.pdf},
year = {2019},
date = {2019-01-01},
booktitle = {Proceedings of NAACL-HLT 2019},
pages = {517--527},
abstract = {In a corpus of data, outliers are either errors: mistakes in the data that are counterproductive, or are unique: informative samples that improve model robustness. Identifying outliers can lead to better datasets by (1) removing noise in datasets and (2) guiding collection of additional data to fill gaps. However, the problem of detecting both outlier types has received relatively little attention in NLP, particularly for dialog systems. We introduce a simple and effective technique for detecting both erroneous and unique samples in a corpus of short texts using neural sentence embeddings combined with distance-based outlier detection. We also present a novel data collection pipeline built atop our detection technique to automatically and iteratively mine unique data samples while discarding erroneous samples. Experiments show that our outlier detection technique is effective at finding errors while our data collection pipeline yields highly diverse corpora that in turn produce more robust intent classification and slot-filling models.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
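One natural instantiation of "sentence embeddings combined with distance-based outlier detection" is to rank each utterance by its embedding's distance from the mean embedding of its intent class: the largest distances surface both errors and unique samples for review. A minimal sketch, assuming the embeddings are already computed with any sentence encoder:

import numpy as np

def rank_outliers(embeddings):
    # Distance from the class mean embedding; highest-ranked items are
    # candidate errors or unique, coverage-expanding samples.
    mean = embeddings.mean(axis=0)
    dists = np.linalg.norm(embeddings - mean, axis=1)
    return np.argsort(-dists), dists

emb = np.random.default_rng(0).normal(size=(100, 384))  # stand-in embeddings
order, dists = rank_outliers(emb)
print(order[:5], dists[order[0]])  # five most outlying utterances first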
Ram Srivatsa Kannan; Michael Laurenzano; Jeongseob Ahn; Jason Mars; Lingjia Tang
Caliper: Interference estimator for multi-tenant environments sharing architectural resources Journal Article
In: ACM Transactions on Architecture and Code Optimization (TACO), vol. 16, no. 3, pp. 1–25, 2019.
@article{kannan2019caliper,
title = {Caliper: Interference estimator for multi-tenant environments sharing architectural resources},
author = {Ram Srivatsa Kannan and Michael Laurenzano and Jeongseob Ahn and Jason Mars and Lingjia Tang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3323090.pdf},
year = {2019},
date = {2019-01-01},
journal = {ACM Transactions on Architecture and Code Optimization (TACO)},
volume = {16},
number = {3},
pages = {1--25},
publisher = {ACM New York, NY, USA},
abstract = {We introduce Caliper, a technique for accurately estimating performance interference occurring in shared servers. Caliper overcomes the limitations of prior approaches by leveraging a micro-experiment-based technique. In contrast to state-of-the-art approaches that focus on periodically pausing co-running applications to estimate slowdown, Caliper utilizes a strategic phase-triggered technique to capture interference due to co-location. This enables Caliper to orchestrate an accurate and low-overhead interference estimation technique that can be readily deployed in existing production systems. We evaluate Caliper for a broad spectrum of workload scenarios, demonstrating its ability to seamlessly support up to 16 applications running simultaneously and outperform the state-of-the-art approaches.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Stefan Larson; Anish Mahendran; Joseph J Peper; Christopher Clarke; Andrew Lee; Parker Hill; Jonathan K Kummerfeld; Kevin Leach; Michael A Laurenzano; Lingjia Tang; Jason Mars
An Evaluation Dataset for Intent Classification and Out-of-Scope Prediction Proceedings Article
In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing, pp. 1311–1316, 2019.
@inproceedings{larson2019evaluation,
title = {An Evaluation Dataset for Intent Classification and Out-of-Scope Prediction},
author = {Stefan Larson and Anish Mahendran and Joseph J Peper and Christopher Clarke and Andrew Lee and Parker Hill and Jonathan K Kummerfeld and Kevin Leach and Michael A Laurenzano and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/D19-1131.pdf},
year = {2019},
date = {2019-01-01},
booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing},
pages = {1311--1316},
abstract = {Task-oriented dialog systems need to know when a query falls outside their range of supported intents, but current text classification corpora only define label sets that cover every example. We introduce a new dataset that includes queries that are out-of-scope---i.e., queries that do not fall into any of the system's supported intents. This poses a new challenge because models cannot assume that every query at inference time belongs to a system-supported intent class. Our dataset also covers 150 intent classes over 10 domains, capturing the breadth that a production task-oriented agent must handle. We evaluate a range of benchmark classifiers on our dataset along with several different out-of-scope identification schemes. We find that while the classifiers perform well on in-scope intent classification, they struggle to identify out-of-scope queries. Our dataset and evaluation fill an important gap in the field, offering a way of more rigorously and realistically benchmarking text classification in task-driven dialog systems.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
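One simple identification scheme of the kind evaluated here is a confidence threshold on an in-scope classifier: predict the top intent only when its probability clears the threshold, otherwise flag the query as out-of-scope. A minimal sketch (the threshold value and labels are assumptions):

import numpy as np

def predict_with_oos(probs, labels, tau=0.7):
    # Fall back to out-of-scope when the classifier is not confident
    # in any of the supported intents.
    i = int(np.argmax(probs))
    return labels[i] if probs[i] >= tau else "out_of_scope"

labels = ["transfer_money", "check_balance", "book_flight"]
print(predict_with_oos(np.array([0.40, 0.35, 0.25]), labels))  # -> out_of_scope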
2018
Parker Hill; Babak Zamirai; Shengshuo Lu; Yu-Wei Chao; Michael Laurenzano; Mehrzad Samadi; Marios Papaefthymiou; Scott Mahlke; Thomas Wenisch; Jia Deng; Lingjia Tang; Jason Mars
Rethinking numerical representations for deep neural networks Journal Article
In: arXiv preprint arXiv:1808.02513, 2018.
@article{hill2018rethinking,
title = {Rethinking numerical representations for deep neural networks},
author = {Parker Hill and Babak Zamirai and Shengshuo Lu and Yu-Wei Chao and Michael Laurenzano and Mehrzad Samadi and Marios Papaefthymiou and Scott Mahlke and Thomas Wenisch and Jia Deng and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/1808.02513.pdf},
year = {2018},
date = {2018-01-01},
journal = {arXiv preprint arXiv:1808.02513},
abstract = {With ever-increasing computational demand for deep learning, it is critical to investigate the implications of the numeric representation and precision of DNN model weights and activations on computational efficiency. In this work, we explore unconventional narrow-precision floating-point representations as they relate to inference accuracy and efficiency to steer the improved design of future DNN platforms. We show that inference using these custom numeric representations on production-grade DNNs, including GoogLeNet and VGG, achieves an average speedup of 7.6x with less than 1% degradation in inference accuracy relative to a state-of-the-art baseline platform representing the most sophisticated hardware using single-precision floating point. To facilitate the use of such customized precision, we also present a novel technique that drastically reduces the time required to derive the optimal precision configuration.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
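Custom narrow-precision floating point can be emulated in software by truncating the mantissa and clamping the exponent of each value, a common way to study accuracy/efficiency trade-offs like those reported here. A hedged sketch: the rounding mode and range handling are simplifying assumptions, not the paper's exact procedure.

import numpy as np

def quantize_float(x, mant_bits, exp_bits):
    # Emulate a custom float: round the mantissa to `mant_bits` bits and
    # restrict the exponent to the range representable by `exp_bits`.
    m, e = np.frexp(x)                  # x = m * 2**e with 0.5 <= |m| < 1
    scale = 2.0 ** mant_bits
    m = np.round(m * scale) / scale     # round-to-nearest mantissa (assumed)
    e_lim = 2 ** (exp_bits - 1)
    m = np.where(e < -e_lim, 0.0, m)    # underflow flushes to zero (no denormals)
    e = np.clip(e, -e_lim, e_lim)       # overflow saturates (simplified)
    return np.ldexp(m, e)

w = np.array([0.1234, -3.75, 1e-9])
print(quantize_float(w, mant_bits=5, exp_bits=4))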
Shih-Chieh Lin; Yunqi Zhang; Chang-Hong Hsu; Matt Skach; Md E Haque; Lingjia Tang; Jason Mars
The architectural implications of autonomous driving: Constraints and acceleration Proceedings Article
In: Proceedings of the Twenty-Third International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 751–766, 2018.
@inproceedings{lin2018architectural,
title = {The architectural implications of autonomous driving: Constraints and acceleration},
author = {Shih-Chieh Lin and Yunqi Zhang and Chang-Hong Hsu and Matt Skach and Md E Haque and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/AutonomousCar-ASPLOS18.pdf},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the Twenty-Third International Conference on Architectural Support for Programming Languages and Operating Systems},
pages = {751--766},
abstract = {Autonomous driving systems have attracted a significant amount of interest recently, and many industry leaders, such as Google, Uber, Tesla, and Mobileye, have invested a large amount of capital and engineering power in developing such systems. Building autonomous driving systems is particularly challenging due to stringent performance requirements in terms of both making safe operational decisions and finishing processing in real time. Despite the recent advancements in technology, such systems are still largely under experimentation and architecting end-to-end autonomous driving systems remains an open research question. To investigate this question, we first present and formalize the design constraints for building an autonomous driving system in terms of performance, predictability, storage, thermal and power. We then build an end-to-end autonomous driving system using state-of-the-art award-winning algorithms to understand the design trade-offs for building such systems. In our real-system characterization, we identify three computational bottlenecks, which conventional multicore CPUs are incapable of processing under the identified design constraints. To meet these constraints, we accelerate these algorithms using three accelerator platforms including GPUs, FPGAs, and ASICs, which can reduce the tail latency of the system by 169x, 10x, and 93x respectively. With accelerator-based designs, we are able to build an end-to-end autonomous driving system that meets all the design constraints, and explore the trade-offs among performance, power and the higher accuracy enabled by higher resolution cameras.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Chang-Hong Hsu; Qingyuan Deng; Jason Mars; Lingjia Tang
Smoothoperator: Reducing power fragmentation and improving power utilization in large-scale datacenters Proceedings Article
In: Proceedings of the Twenty-Third International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 535–548, 2018.
@inproceedings{hsu2018smoothoperator,
title = {Smoothoperator: Reducing power fragmentation and improving power utilization in large-scale datacenters},
author = {Chang-Hong Hsu and Qingyuan Deng and Jason Mars and Lingjia Tang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/smooth_operator.pdf},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the Twenty-Third International Conference on Architectural Support for Programming Languages and Operating Systems},
pages = {535--548},
abstract = {With the ever-growing popularity of cloud computing and web services, Internet companies are in need of increased computing capacity to serve the demand. However, power has become a major limiting factor prohibiting growth in the industry: it is often the case that no more servers can be added to datacenters without surpassing the capacity of the existing power infrastructure.
In this work, we first investigate the power utilization in Facebook datacenters. We observe that the combination of provisioning for peak power usage, highly fluctuating traffic, and multi-level power delivery infrastructure leads to a significant power budget fragmentation problem and inefficiently low power utilization. To address this issue, our insight is that heterogeneity of power consumption patterns among different services provides opportunities to re-shape the power profile of each power node by re-distributing services. By grouping services with asynchronous peak times under the same power node, we can reduce the peak power of each node and thus create more power headroom, allowing more servers to be hosted and achieving higher throughput. Based on this insight, we develop a workload-aware service placement framework to systematically spread the service instances with synchronous power patterns evenly under the power supply tree, greatly reducing the peak power draw at power nodes. We then leverage dynamic power profile reshaping to maximally utilize the headroom unlocked by our placement framework. Our experiments based on real production workload and power traces show that we are able to host up to 13% more machines in production, without changing the underlying power infrastructure. Utilizing the unleashed power headroom with dynamic reshaping, we achieve up to an estimated total of 15% and 11% throughput improvement for latency-critical service and batch service respectively at the same time, with up to 44% of energy slack reduction.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
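The placement insight (group service instances whose power peaks are asynchronous under the same power node) can be approximated greedily: place each instance on the node where adding its power trace raises that node's peak the least. An illustrative sketch with synthetic diurnal traces, not the production framework:

import numpy as np

def place_instances(profiles, n_nodes):
    # Greedy peak-aware placement: largest-peak instances first, each on
    # the node where adding its power trace increases the peak least.
    horizon = len(profiles[0])
    loads = [np.zeros(horizon) for _ in range(n_nodes)]
    assignment = []
    for idx in sorted(range(len(profiles)), key=lambda i: -profiles[i].max()):
        node = min(range(n_nodes), key=lambda n: (loads[n] + profiles[idx]).max())
        loads[node] += profiles[idx]
        assignment.append((idx, node))
    return assignment, [float(l.max()) for l in loads]

# Two synthetic services with asynchronous peaks (midday- vs midnight-peaking).
t = np.arange(24)
day = 50 + 30 * np.sin(np.pi * t / 24)
night = 50 + 30 * np.cos(np.pi * t / 24)
# Pairing one of each per node yields lower node peaks than pairing alike.
print(place_instances([day, day, night, night], n_nodes=2))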
Animesh Jain; Amar Phanishayee; Jason Mars; Lingjia Tang; Gennady Pekhimenko
Gist: Efficient data encoding for deep neural network training Proceedings Article
In: 2018 ACM/IEEE 45th Annual International Symposium on Computer Architecture (ISCA), pp. 776–789, IEEE 2018.
@inproceedings{jain2018gist,
title = {Gist: Efficient data encoding for deep neural network training},
author = {Animesh Jain and Amar Phanishayee and Jason Mars and Lingjia Tang and Gennady Pekhimenko},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/08416872.pdf},
year = {2018},
date = {2018-01-01},
booktitle = {2018 ACM/IEEE 45th Annual International Symposium on Computer Architecture (ISCA)},
pages = {776--789},
organization = {IEEE},
abstract = {Training modern deep neural networks (DNNs) typically relies on GPUs to handle complex hundred-layer networks. A significant problem facing both researchers and industry practitioners is that, as the networks get deeper, the available GPU main memory becomes a primary bottleneck, limiting the size of networks they can train. In this paper, we investigate widely used DNNs and find that the major contributors to memory footprint are intermediate layer outputs (feature maps). We then introduce a framework for DNN-layer-specific optimizations (e.g., convolution, ReLU, pool) that significantly reduce this source of main memory pressure on GPUs. We find that a feature map typically has two uses that are spread far apart temporally. Our key approach is to store an encoded representation of feature maps for this temporal gap and decode this data for use in the backward pass; the full-fidelity feature maps are used in the forward pass and relinquished immediately. Based on this approach, we present Gist, our system that employs two classes of layer-specific encoding schemes - lossless and lossy - to exploit existing value redundancy in DNN training to significantly reduce the memory consumption of targeted feature maps. For example, one insight is by taking advantage of the computational nature of back propagation from pool to ReLU layer, we can store the intermediate feature map using just 1 bit instead of 32 bits per value. We deploy these mechanisms in a state-of-the-art DNN framework (CNTK) and observe that Gist reduces the memory footprint by up to 2× across 5 state-of-the-art image classification DNNs, with an average of 1.8× and only 4% performance overhead. We also show that further software (e.g., CuDNN) and hardware (e.g., dynamic allocation) optimizations can result in even larger footprint reductions (up to 4.1×).},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
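The 1-bit insight mentioned in the abstract: between a ReLU and the backward pass, the only information needed is whether each activation was positive, which fits in one bit per value instead of 32. A minimal numpy sketch of that idea (the real system performs this inside CNTK's execution graph):

import numpy as np

def encode_relu_output(y):
    # Backward of ReLU needs only the positivity mask: 1 bit per value.
    mask = (y > 0).ravel()
    return np.packbits(mask), y.shape

def relu_backward(grad_out, packed, shape):
    mask = np.unpackbits(packed, count=grad_out.size).reshape(shape)
    return grad_out * mask  # zero gradient where the input was non-positive

y = np.array([[0.7, -0.2], [0.0, 1.3]], dtype=np.float32)
packed, shape = encode_relu_output(np.maximum(y, 0))
print(relu_backward(np.ones_like(y), packed, shape))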
Yiping Kang; Yunqi Zhang; Jonathan K Kummerfeld; Lingjia Tang; Jason Mars
Data collection for dialogue system: A startup perspective Proceedings Article
In: Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 3 (Industry Papers), pp. 33–40, 2018.
@inproceedings{kang2018data,
title = {Data collection for dialogue system: A startup perspective},
author = {Yiping Kang and Yunqi Zhang and Jonathan K Kummerfeld and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/N18-3005.pdf},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 3 (Industry Papers)},
pages = {33--40},
abstract = {Industrial dialogue systems such as Apple Siri and Google Now rely on large-scale, diverse, and robust training data to enable their sophisticated conversation capability. Crowdsourcing provides a scalable and inexpensive way of data collection, but collecting high-quality data efficiently requires thoughtful orchestration of the crowdsourcing jobs. Prior studies of this topic have focused on tasks only in academic settings with limited scope, or provide only intrinsic dataset analysis, lacking indication of how the collected data affects trained model performance. In this paper, we present a study of crowdsourcing methods for a user intent classification task in our deployed dialogue system. Our task requires classification of 47 possible user intents and contains many intent pairs with subtle differences. We consider different crowdsourcing job types and job prompts and analyze quantitatively the quality of the collected data and the downstream model performance on a test set of real user queries from production logs. Our observations provide insights into designing efficient crowdsourcing jobs and recommendations for the future dialogue system data collection process.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Ram Srivatsa Kannan; Animesh Jain; Michael A Laurenzano; Lingjia Tang; Jason Mars
Proctor: Detecting and investigating interference in shared datacenters Proceedings Article
In: 2018 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS), pp. 76–86, IEEE 2018.
@inproceedings{kannan2018proctor,
title = {Proctor: Detecting and investigating interference in shared datacenters},
author = {Ram Srivatsa Kannan and Animesh Jain and Michael A Laurenzano and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/08366937.pdf},
year = {2018},
date = {2018-01-01},
booktitle = {2018 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)},
pages = {76--86},
organization = {IEEE},
abstract = {Cloud-scale datacenter management systems utilize virtualization to provide performance isolation while maximizing the utilization of the underlying hardware infrastructure. However, virtualization does not provide complete performance isolation as Virtual Machines (VMs) still compete for nonreservable shared resources (like caches, network, I/O bandwidth, etc.). This becomes highly challenging to address in datacenter environments housing tens of thousands of VMs, causing degradation in application performance. Addressing this problem for production datacenters requires a non-intrusive scalable solution that 1) detects performance intrusion and 2) investigates both the intrusive VMs causing interference and the resource(s) for which the VMs are competing. To address this problem, this paper introduces Proctor, a real-time, lightweight, and scalable analytics fabric that detects performance-intrusive VMs and identifies their root causes from among the arbitrary VMs running in shared datacenters across 4 key hardware resources: network, I/O, cache, and CPU. Proctor is based on a robust statistical approach that requires no special profiling phases, standing in stark contrast to a wide body of prior work that assumes pre-acquisition of application-level information prior to its execution. By detecting performance degradation and identifying the root cause VMs and their metrics, Proctor can be utilized to dramatically improve the performance outcomes of applications executing in large-scale datacenters. From our experiments, we are able to show that when we deploy Proctor in a datacenter housing a mix of I/O, network, compute and cache-sensitive applications, it is able to effectively pinpoint performance-intrusive VMs. Further, we observe that when Proctor is applied with migration, the application-level Quality-of-Service improves by an average of 2.2× as compared to systems which are unable to detect, identify and pinpoint performance intrusion and their root causes.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
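The statistical core (linking a victim VM's degradation to the co-runner and resource responsible) can be illustrated by correlating the victim's performance over time with each co-runner's per-resource usage and reporting the strongest association. A toy sketch under that assumption, not Proctor's actual estimator:

import numpy as np

def rank_suspects(victim_latency, usage):
    # usage: {(vm, resource): time series of that VM's resource pressure}.
    # Higher correlation with the victim's latency -> stronger suspect.
    suspects = []
    for (vm, res), series in usage.items():
        r = np.corrcoef(victim_latency, series)[0, 1]
        suspects.append((r, vm, res))
    return sorted(suspects, reverse=True)

rng = np.random.default_rng(1)
cache_pressure = rng.random(60)
victim = 10 + 5 * cache_pressure + rng.normal(0, 0.2, 60)  # cache-driven slowdown
usage = {("vm7", "cache"): cache_pressure, ("vm3", "network"): rng.random(60)}
print(rank_suspects(victim, usage)[0])  # -> vm7 / cache as top suspect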
Matt Skach; Manish Arora; Dean Tullsen; Lingjia Tang; Jason Mars
Virtual melting temperature: managing server load to minimize cooling overhead with phase change materials Proceedings Article
In: 2018 ACM/IEEE 45th Annual International Symposium on Computer Architecture (ISCA), pp. 15–28, IEEE 2018.
@inproceedings{skach2018virtual,
title = {Virtual melting temperature: managing server load to minimize cooling overhead with phase change materials},
author = {Matt Skach and Manish Arora and Dean Tullsen and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/08416815.pdf},
year = {2018},
date = {2018-01-01},
booktitle = {2018 ACM/IEEE 45th Annual International Symposium on Computer Architecture (ISCA)},
pages = {15--28},
organization = {IEEE},
abstract = {As the power density and power consumption of large-scale datacenters continue to grow, the challenge of removing heat from these datacenters and keeping them cool is increasingly urgent and costly. With the largest datacenters now exceeding 200 MW of power, the cooling systems that prevent overheating cost on the order of tens of millions of dollars. Prior work proposed to deploy phase change materials (PCM) and use Thermal Time Shifting (TTS) to reshape the thermal load of a datacenter by storing heat during peak hours of high utilization and releasing it during off hours when utilization is low, enabling a smaller cooling system to handle the same peak load. The peak cooling load reduction enabled by TTS is greatly beneficial; however, TTS is a passive system that cannot handle many workload mixtures or adapt to changing load or environmental characteristics. In this work we propose VMT, a thermal aware job placement technique that adds an active, tunable component to enable greater control over datacenter thermal output. We propose two different job placement algorithms for VMT and perform a scale out study of VMT in a simulated server cluster. We provide analysis of the use cases and trade-offs of each algorithm, and show that VMT reduces peak cooling load by up to 12.8% to provide over two million dollars in cost savings when a smaller cooling system is installed, or allows for over 7,000 additional servers to be added in scenarios where TTS is ineffective.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Shih-Chieh Lin; Chang-Hong Hsu; Walter Talamonti; Yunqi Zhang; Steve Oney; Jason Mars; Lingjia Tang
Adasa: A Conversational In-Vehicle Digital Assistant for Advanced Driver Assistance Features Proceedings Article
In: Proceedings of the 31st Annual ACM Symposium on User Interface Software and Technology, pp. 531–542, 2018.
@inproceedings{lin2018adasa,
title = {Adasa: A Conversational In-Vehicle Digital Assistant for Advanced Driver Assistance Features},
author = {Shih-Chieh Lin and Chang-Hong Hsu and Walter Talamonti and Yunqi Zhang and Steve Oney and Jason Mars and Lingjia Tang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/lin2018adasa.pdf},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the 31st Annual ACM Symposium on User Interface Software and Technology},
pages = {531--542},
abstract = {Advanced Driver Assistance Systems (ADAS) come equipped on most modern vehicles and are intended to assist the driver and enhance the driving experience through features such as lane keeping systems and adaptive cruise control. However, recent studies show that few people utilize these features for several reasons. First, ADAS features were not common until recently. Second, most users are unfamiliar with these features and do not know what to expect. Finally, the interface for operating these features is not intuitive. To help drivers understand ADAS features, we present a conversational in-vehicle digital assistant that responds to drivers' questions and commands in natural language. With the system prototyped herein, drivers can ask questions or issue commands using unconstrained natural language in the vehicle, and the assistant, trained using advanced machine learning techniques and coupled with access to vehicle signals, responds in real time based on conversational context. Results of our system prototyped on a production vehicle are presented, demonstrating its effectiveness in improving driver understanding and usability of ADAS.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Animesh Jain; Michael A Laurenzano; Gilles A Pokam; Jason Mars; Lingjia Tang
Architectural support for convolutional neural networks on modern CPUs Proceedings Article
In: Proceedings of the 27th International Conference on Parallel Architectures and Compilation Techniques, pp. 1–13, 2018.
@inproceedings{jain2018architectural,
title = {Architectural support for convolutional neural networks on modern CPUs},
author = {Animesh Jain and Michael A Laurenzano and Gilles A Pokam and Jason Mars and Lingjia Tang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3243176.3243177.pdf},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the 27th International Conference on Parallel Architectures and Compilation Techniques},
pages = {1--13},
abstract = {A key focus of recent work in our community has been on devising increasingly sophisticated acceleration devices for deep neural network (DNN) computation, especially for networks driven by convolution layers. Yet, despite the promise of substantial improvements in performance and energy consumption offered by these approaches, general-purpose computing is not going away because of its traditional, well-understood programming model and continued wide deployment. Therefore, the question arises as to what can be done, if anything, to evolve conventional CPUs to accommodate efficient deep neural network computation.
This work focuses on the challenging problem of identifying and alleviating the performance bottlenecks for convolution layer computation for conventional CPU platforms. We begin by performing a detailed study of a range of CNN-based applications on a modern CPU microarchitecture, finding that designing a physical register file (PRF) capable of feeding computational units is the primary barrier that prevents the addition of more compute units in the CPU, limiting the performance improvements that can be achieved by the CPU on convolution layers. We present the design of a novel, minimally intrusive set of microarchitectural and ISA extensions that address this problem and describe the code generation support needed to take advantage of our design. Through a detailed evaluation that covers 5 state-of-the-art neural network applications, we observe that applying these extensions allows packing more compute in the CPU while keeping PRF energy in check, achieving a 2× performance improvement and a 2.7× energy-delay product improvement against a popular Intel Haswell server processor baseline.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2017
Chang-Hong Hsu; Yunqi Zhang; Michael A Laurenzano; David Meisner; Thomas Wenisch; Ronald G Dreslinski; Jason Mars; Lingjia Tang
Reining in long tails in warehouse-scale computers with quick voltage boosting using adrenaline Journal Article
In: ACM Transactions on Computer Systems (TOCS), vol. 35, no. 1, pp. 1–33, 2017.
@article{hsu2017reining,
title = {Reining in long tails in warehouse-scale computers with quick voltage boosting using adrenaline},
author = {Chang-Hong Hsu and Yunqi Zhang and Michael A Laurenzano and David Meisner and Thomas Wenisch and Ronald G Dreslinski and Jason Mars and Lingjia Tang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3054742.pdf},
year = {2017},
date = {2017-01-01},
journal = {ACM Transactions on Computer Systems (TOCS)},
volume = {35},
number = {1},
pages = {1--33},
publisher = {ACM New York, NY, USA},
abstract = {Reducing the long tail of the query latency distribution in modern warehouse scale computers is critical for improving performance and quality of service (QoS) of workloads such as Web Search and Memcached. Traditional turbo boost increases a processor’s voltage and frequency during a coarse-grained sliding window, boosting all queries that are processed during that window. However, the inability of such a technique to pinpoint tail queries for boosting limits its tail reduction benefit. In this work, we propose Adrenaline, an approach to leverage finer-granularity (tens of nanoseconds) voltage boosting to effectively rein in the tail latency with query-level precision. Two key insights underlie this work. First, emerging finer granularity voltage/frequency boosting is an enabling mechanism for intelligent allocation of the power budget to precisely boost only the queries that contribute to the tail latency; second, per-query characteristics can be used to design indicators for proactively pinpointing these queries, triggering boosting accordingly. Based on these insights, Adrenaline effectively pinpoints and boosts queries that are likely to increase the tail distribution and can reap more benefit from the voltage/frequency boost. By evaluating under various workload configurations, we demonstrate the effectiveness of our methodology. We achieve up to a 2.50× tail latency improvement for Memcached and up to 3.03× for Web Search over coarse-grained dynamic voltage and frequency scaling (DVFS) given a fixed boosting power budget. When optimizing for energy reduction, Adrenaline achieves up to a 1.81× improvement for Memcached and up to 1.99× for Web Search over coarse-grained DVFS. By using carefully chosen boost thresholds, Adrenaline further improves the tail latency reduction to 4.82× over coarse-grained DVFS.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
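The per-query mechanism: an indicator predicts whether an arriving query is likely to land in the tail, and the runtime boosts voltage/frequency for exactly those queries while respecting a power budget. A schematic sketch with an assumed indicator and budget accounting, illustrative only:

class QueryBooster:
    """Schematic per-query boost policy (illustrative, not Adrenaline itself)."""
    def __init__(self, budget_watts, boost_cost_watts):
        self.budget = budget_watts
        self.cost = boost_cost_watts
        self.in_flight_boosts = 0

    def should_boost(self, predicted_latency_us, tail_threshold_us):
        # Boost only queries predicted to fall in the tail, and only
        # while the extra power draw stays within the boosting budget.
        likely_tail = predicted_latency_us >= tail_threshold_us
        has_budget = (self.in_flight_boosts + 1) * self.cost <= self.budget
        if likely_tail and has_budget:
            self.in_flight_boosts += 1
            return True
        return False

    def release(self):
        # Called when a boosted query completes, freeing its power share.
        self.in_flight_boosts = max(0, self.in_flight_boosts - 1)

b = QueryBooster(budget_watts=40, boost_cost_watts=15)
print(b.should_boost(predicted_latency_us=900, tail_threshold_us=800))  # True
print(b.should_boost(predicted_latency_us=100, tail_threshold_us=800))  # False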