2022
Clarke, Christopher; Peper, Joseph; Krishnamurthy, Karthik; Talamonti, Walter; Leach, Kevin; Lasecki, Walter; Kang, Yiping; Tang, Lingjia; Mars, Jason
One Agent To Rule Them All: Towards Multi-agent Conversational AI Inproceedings
In: Findings of the Association for Computational Linguistics: ACL 2022, pp. 3258–3267, Association for Computational Linguistics, Dublin, Ireland, 2022.
@inproceedings{clarke-etal-2022-one,
title = {One Agent To Rule Them All: Towards Multi-agent Conversational AI},
author = {Christopher Clarke and Joseph Peper and Karthik Krishnamurthy and Walter Talamonti and Kevin Leach and Walter Lasecki and Yiping Kang and Lingjia Tang and Jason Mars},
url = {https://aclanthology.org/2022.findings-acl.257},
doi = {10.18653/v1/2022.findings-acl.257},
year = {2022},
date = {2022-05-01},
urldate = {2022-05-01},
booktitle = {Findings of the Association for Computational Linguistics: ACL 2022},
pages = {3258--3267},
publisher = {Association for Computational Linguistics},
address = {Dublin, Ireland},
abstract = {The increasing volume of commercially available conversational agents (CAs) on the market has resulted in users being burdened with learning and adopting multiple agents to accomplish their tasks. Though prior work has explored supporting a multitude of domains within the design of a single agent, the interaction experience suffers due to the large action space of desired capabilities. To address these problems, we introduce a new task BBAI: Black-Box Agent Integration, focusing on combining the capabilities of multiple black-box CAs at scale. We explore two techniques: question agent pairing and question response pairing aimed at resolving this task. Leveraging these techniques, we design One For All (OFA), a scalable system that provides a unified interface to interact with multiple CAs. Additionally, we introduce MARS: Multi-Agent Response Selection, a new encoder model for question response pairing that jointly encodes user question and agent response pairs. We demonstrate that OFA is able to automatically and accurately integrate an ensemble of commercially available CAs spanning disparate domains. Specifically, using the MARS encoder we achieve the highest accuracy on our BBAI task, outperforming strong baselines.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
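As a rough illustration of the question-response pairing technique described in this abstract (not the authors' released code), the sketch below sends a question to several black-box agents, jointly scores each (question, response) pair, and routes to the best-scoring agent. An off-the-shelf cross-encoder from sentence-transformers stands in for the paper's MARS encoder; the agent callables and model name are illustrative.

```python
# Hypothetical sketch of question-response pairing for black-box agent
# integration: query every agent, jointly score each (question, response)
# pair, and return the best-scoring agent's answer. A public cross-encoder
# is used as a stand-in for the MARS encoder described in the paper.
from sentence_transformers import CrossEncoder

def route(question, agents, scorer):
    """agents: mapping of agent name -> callable(question) -> response."""
    responses = {name: ask(question) for name, ask in agents.items()}
    pairs = [(question, resp) for resp in responses.values()]
    scores = scorer.predict(pairs)  # one relevance score per pair
    (name, response), _ = max(zip(responses.items(), scores),
                              key=lambda x: x[1])
    return name, response

scorer = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
agents = {"weather_bot": lambda q: "It will rain tomorrow.",
          "music_bot": lambda q: "Now playing jazz."}
print(route("Do I need an umbrella tomorrow?", agents, scorer))
```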
2020
Liu, Tianyi; He, Sen; Huang, Sunzhou; Tsang, Danny; Tang, Lingjia; Mars, Jason; Wang, Wei
A Benchmarking Framework for Interactive 3D Applications in the Cloud Inproceedings
In: 2020 53rd Annual IEEE/ACM International Symposium on Microarchitecture (MICRO), pp. 881–894, IEEE 2020.
@inproceedings{liu2020benchmarking,
title = {A Benchmarking Framework for Interactive 3D Applications in the Cloud},
author = {Tianyi Liu and Sen He and Sunzhou Huang and Danny Tsang and Lingjia Tang and Jason Mars and Wei Wang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/12/2006.13378.pdf},
year = {2020},
date = {2020-01-01},
urldate = {2020-01-01},
booktitle = {2020 53rd Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
pages = {881--894},
organization = {IEEE},
abstract = {With the growing popularity of cloud gaming and cloud virtual reality (VR), interactive 3D applications have become a major class of workloads for the cloud. However, despite their growing importance, there is limited public research on how to design cloud systems to efficiently support these applications due to the lack of an open and reliable research infrastructure, including benchmarks and performance analysis tools. The challenges of generating human-like inputs under various system/application nondeterminism and dissecting the performance of complex graphics systems make it very difficult to design such an infrastructure. In this paper, we present the design of a novel research infrastructure, Pictor, for cloud 3D applications and systems. Pictor employs AI to mimic human interactions with complex 3D applications. It can also track the processing of user inputs to provide in-depth performance measurements for the complex software and hardware stack used for cloud 3D-graphics rendering. With Pictor, we designed a benchmark suite with six interactive 3D applications. Performance analyses were conducted with these benchmarks, which show that cloud system designs, including both system software and hardware designs, are crucial to the performance of cloud 3D applications. The analyses also show that energy consumption can be reduced by at least 37% when two 3D applications share a cloud server. To demonstrate the effectiveness of Pictor, we also implemented two optimizations to address two performance bottlenecks discovered in a state-of-the-art cloud 3D-graphics rendering system. These two optimizations improved the frame rate by 57.7% on average.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2019
Kannan, Ram Srivatsa; Subramanian, Lavanya; Raju, Ashwin; Ahn, Jeongseob; Mars, Jason; Tang, Lingjia
GrandSLAm: Guaranteeing SLAs for jobs in microservices execution frameworks Inproceedings
In: Proceedings of the Fourteenth EuroSys Conference 2019, pp. 1–16, 2019.
@inproceedings{kannan2019grandslam,
title = {GrandSLAm: Guaranteeing SLAs for jobs in microservices execution frameworks},
author = {Ram Srivatsa Kannan and Lavanya Subramanian and Ashwin Raju and Jeongseob Ahn and Jason Mars and Lingjia Tang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3302424.3303958.pdf},
year = {2019},
date = {2019-01-01},
booktitle = {Proceedings of the Fourteenth EuroSys Conference 2019},
pages = {1--16},
abstract = {The microservice architecture has dramatically reduced user effort in adopting and maintaining servers by providing a catalog of functions as services that can be used as building blocks to construct applications. This has enabled datacenter operators to manage datacenters hosting microservices quite differently from traditional infrastructures. Such a paradigm shift calls for rethinking the resource management strategies employed in such execution environments. We observe that the visibility enabled by a microservices execution framework can be exploited to achieve high throughput and resource utilization while still meeting Service Level Agreements, especially in multi-tenant execution scenarios.
In this study, we present GrandSLAm, a microservice execution framework that improves utilization of datacenters hosting microservices. GrandSLAm estimates the completion time of requests propagating through individual microservice stages within an application. It then leverages this estimate to drive a runtime system that dynamically batches and reorders requests at each microservice in a manner where individual jobs meet their respective target latency while achieving high throughput. GrandSLAm significantly increases throughput by up to 3x compared to our baseline, without violating SLAs for a wide range of real-world AI and ML applications.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
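A minimal sketch of the slack-driven batching and reordering idea in this abstract, assuming invented per-stage execution estimates (this is not GrandSLAm's implementation): each request's slack is its deadline minus the time its remaining downstream stages are estimated to need, and the stage serves least-slack requests first.

```python
# Toy single-stage version of slack-based reordering: requests are kept in
# a min-heap ordered by slack, and a batch grows only while each added
# request can still tolerate one batch execution. All numbers are invented.
import heapq

class Stage:
    def __init__(self, est_exec_ms, max_batch):
        self.est_exec_ms = est_exec_ms   # estimated execution time of one batch
        self.max_batch = max_batch
        self.queue = []                  # min-heap keyed on slack

    def submit(self, req_id, deadline_ms, downstream_est_ms, now_ms):
        # slack = time left after accounting for the stages still ahead
        slack = deadline_ms - now_ms - downstream_est_ms
        heapq.heappush(self.queue, (slack, req_id))

    def next_batch(self):
        batch = []
        while self.queue and len(batch) < self.max_batch:
            slack, req_id = self.queue[0]
            # stop growing once an added request could no longer tolerate
            # waiting for one batch execution
            if batch and slack < self.est_exec_ms:
                break
            batch.append(heapq.heappop(self.queue)[1])
        return batch

stage = Stage(est_exec_ms=5.0, max_batch=4)
for rid, deadline in [("a", 20.0), ("b", 9.0), ("c", 40.0)]:
    stage.submit(rid, deadline, downstream_est_ms=3.0, now_ms=0.0)
print(stage.next_batch())   # -> ['b', 'a', 'c'] (tightest slack served first)
```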
Arora, Manish; Skach, Matt; Huang, Wei; An, Xudong; Mars, Jason; Tang, Lingjia; Tullsen, Dean M
Understanding the Impact of Socket Density in Density Optimized Servers Inproceedings
In: 2019 IEEE International Symposium on High Performance Computer Architecture (HPCA), pp. 687–700, IEEE 2019.
@inproceedings{arora2019understanding,
title = {Understanding the Impact of Socket Density in Density Optimized Servers},
author = {Manish Arora and Matt Skach and Wei Huang and Xudong An and Jason Mars and Lingjia Tang and Dean M Tullsen},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/08675196.pdf},
year = {2019},
date = {2019-01-01},
booktitle = {2019 IEEE International Symposium on High Performance Computer Architecture (HPCA)},
pages = {687--700},
organization = {IEEE},
abstract = {The increasing demand for computational power has led to the creation and deployment of large-scale data centers. During the last few years, data centers have seen improvements aimed at increasing computational density - the amount of throughput that can be achieved within the allocated physical footprint. This need to pack more compute in the same physical space has led to density optimized server designs. Density optimized servers push compute density significantly beyond what can be achieved by blade servers by using innovative modular chassis based designs. This paper presents a comprehensive analysis of the impact of socket density on intra-server thermals and demonstrates that increased socket density inside the server leads to large temperature variations among sockets due to inter-socket thermal coupling. The paper shows that traditional chip-level and data center-level temperature-aware scheduling techniques do not work well for thermally-coupled sockets. The paper proposes new scheduling techniques that account for the thermals of the socket a task is scheduled on, as well as thermally coupled nearby sockets. The proposed mechanisms provide 2.5% to 6.5% performance improvements across various workloads and as much as 17% over traditional temperature-aware schedulers for computation-heavy workloads.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
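The scheduling idea in this abstract lends itself to a small sketch: charge a candidate socket not only for its own temperature but for the heat coupled in from its neighbors. The coupling coefficients and temperatures below are made up for illustration; this is not the paper's scheduler.

```python
# Hedged sketch of thermal-coupling-aware placement: pick the idle socket
# with the lowest *effective* temperature, i.e., its own temperature plus
# heat contributed by thermally coupled neighbors. All values are invented.
def pick_socket(temps, coupling, idle):
    """temps[i]: current temperature of socket i (deg C);
    coupling[i][j]: fraction of socket j's heat felt at socket i."""
    def effective_temp(i):
        return temps[i] + sum(coupling[i][j] * temps[j]
                              for j in range(len(temps)) if j != i)
    return min(idle, key=effective_temp)

temps = [55.0, 72.0, 60.0, 58.0]
coupling = [[0, .15, .05, .02], [.15, 0, .15, .05],
            [.05, .15, 0, .15], [.02, .05, .15, 0]]
print(pick_socket(temps, coupling, idle=[0, 2, 3]))  # coolest effective socket
```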
Larson, Stefan; Mahendran, Anish; Lee, Andrew; Kummerfeld, Jonathan K; Hill, Parker; Laurenzano, Michael A; Hauswald, Johann; Tang, Lingjia; Mars, Jason
Outlier Detection for Improved Data Quality and Diversity in Dialog Systems Journal Article
In: Proceedings of NAACL-HLT 2019, pp. 517–527, 2019.
@article{larson2019outlier,
title = {Outlier Detection for Improved Data Quality and Diversity in Dialog Systems},
author = {Stefan Larson and Anish Mahendran and Andrew Lee and Jonathan K Kummerfeld and Parker Hill and Michael A Laurenzano and Johann Hauswald and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/N19-1051.pdf},
year = {2019},
date = {2019-01-01},
journal = {Proceedings of NAACL-HLT 2019},
pages = {517--527},
abstract = {In a corpus of data, outliers are either errors: mistakes in the data that are counterproductive, or are unique: informative samples that improve model robustness. Identifying outliers can lead to better datasets by (1) removing noise in datasets and (2) guiding collection of additional data to fill gaps. However, the problem of detecting both outlier types has received relatively little attention in NLP, particularly for dialog systems. We introduce a simple and effective technique for detecting both erroneous and unique samples in a corpus of short texts using neural sentence embeddings combined with distance-based outlier detection. We also present a novel data collection pipeline built atop our detection technique to automatically and iteratively mine unique data samples while discarding erroneous samples. Experiments show that our outlier detection technique is effective at finding errors while our data collection pipeline yields highly diverse corpora that in turn produce more robust intent classification and slot-filling models.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
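The core recipe in this abstract, neural sentence embeddings combined with distance-based outlier detection, fits in a few lines. The sketch below uses an off-the-shelf encoder and mean-distance ranking as a stand-in for the paper's exact setup; model name and example texts are illustrative.

```python
# Minimal sketch of distance-based outlier detection for short texts:
# embed each utterance, then rank by distance from the mean embedding;
# the far tail holds candidate errors and unique samples.
import numpy as np
from sentence_transformers import SentenceTransformer

texts = ["book a flight to boston", "book a flight to nyc",
         "reserve a plane ticket", "what's my bank balance"]
model = SentenceTransformer("all-MiniLM-L6-v2")   # stand-in encoder
emb = model.encode(texts)                          # (n, d) array
center = emb.mean(axis=0)
dists = np.linalg.norm(emb - center, axis=1)
for text, d in sorted(zip(texts, dists), key=lambda x: -x[1]):
    print(f"{d:.3f}  {text}")   # largest distance = most outlying
```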
Kannan, Ram Srivatsa; Laurenzano, Michael; Ahn, Jeongseob; Mars, Jason; Tang, Lingjia
Caliper: Interference estimator for multi-tenant environments sharing architectural resources Journal Article
In: ACM Transactions on Architecture and Code Optimization (TACO), vol. 16, no. 3, pp. 1–25, 2019.
@article{kannan2019caliper,
title = {Caliper: Interference estimator for multi-tenant environments sharing architectural resources},
author = {Ram Srivatsa Kannan and Michael Laurenzano and Jeongseob Ahn and Jason Mars and Lingjia Tang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3323090.pdf},
year = {2019},
date = {2019-01-01},
journal = {ACM Transactions on Architecture and Code Optimization (TACO)},
volume = {16},
number = {3},
pages = {1--25},
publisher = {ACM New York, NY, USA},
abstract = {We introduce Caliper, a technique for accurately estimating performance interference occurring in shared servers. Caliper overcomes the limitations of prior approaches by leveraging a micro-experiment-based technique. In contrast to state-of-the-art approaches that focus on periodically pausing co-running applications to estimate slowdown, Caliper utilizes a strategic phase-triggered technique to capture interference due to co-location. This enables Caliper to orchestrate an accurate and low-overhead interference estimation technique that can be readily deployed in existing production systems. We evaluate Caliper for a broad spectrum of workload scenarios, demonstrating its ability to seamlessly support up to 16 applications running simultaneously and outperform the state-of-the-art approaches.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Larson, Stefan; Mahendran, Anish; Peper, Joseph J; Clarke, Christopher; Lee, Andrew; Hill, Parker; Kummerfeld, Jonathan K; Leach, Kevin; Laurenzano, Michael A; Tang, Lingjia; Mars, Jason
An Evaluation Dataset for Intent Classification and Out-of-Scope Prediction Journal Article
In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing, pp. 1311–1316, 2019.
@article{larson2019evaluation,
title = {An Evaluation Dataset for Intent Classification and Out-of-Scope Prediction},
author = {Stefan Larson and Anish Mahendran and Joseph J Peper and Christopher Clarke and Andrew Lee and Parker Hill and Jonathan K Kummerfeld and Kevin Leach and Michael A Laurenzano and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/D19-1131.pdf},
year = {2019},
date = {2019-01-01},
journal = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing},
pages = {1311--1316},
abstract = {Task-oriented dialog systems need to know when a query falls outside their range of supported intents, but current text classification corpora only define label sets that cover every example. We introduce a new dataset that includes queries that are out-of-scope---i.e., queries that do not fall into any of the system's supported intents. This poses a new challenge because models cannot assume that every query at inference time belongs to a system-supported intent class. Our dataset also covers 150 intent classes over 10 domains, capturing the breadth that a production task-oriented agent must handle. We evaluate a range of benchmark classifiers on our dataset along with several different out-of-scope identification schemes. We find that while the classifiers perform well on in-scope intent classification, they struggle to identify out-of-scope queries. Our dataset and evaluation fill an important gap in the field, offering a way of more rigorously and realistically benchmarking text classification in task-driven dialog systems.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
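One common out-of-scope identification scheme of the kind this paper evaluates is thresholding the classifier's top softmax probability. The sketch below illustrates that scheme only; the intent labels, logits, and threshold are invented, and this is not the paper's benchmark code.

```python
# Hedged sketch: reject a query as out-of-scope when the classifier's
# top softmax probability falls below a confidence threshold.
import numpy as np

def classify_with_oos(logits, labels, threshold=0.7):
    probs = np.exp(logits - logits.max())   # numerically stable softmax
    probs /= probs.sum()
    top = int(np.argmax(probs))
    if probs[top] < threshold:
        return "out_of_scope", float(probs[top])
    return labels[top], float(probs[top])

labels = ["book_flight", "check_balance", "play_music"]
print(classify_with_oos(np.array([3.2, 0.4, 0.1]), labels))  # confident -> in-scope
print(classify_with_oos(np.array([0.9, 0.8, 0.7]), labels))  # flat -> out_of_scope
```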
2018
Hill, Parker; Zamirai, Babak; Lu, Shengshuo; Chao, Yu-Wei; Laurenzano, Michael; Samadi, Mehrzad; Papaefthymiou, Marios; Mahlke, Scott; Wenisch, Thomas; Deng, Jia; Tang, Lingjia; Mars, Jason
Rethinking numerical representations for deep neural networks Journal Article
In: arXiv preprint arXiv:1808.02513, 2018.
@article{hill2018rethinking,
title = {Rethinking numerical representations for deep neural networks},
author = {Parker Hill and Babak Zamirai and Shengshuo Lu and Yu-Wei Chao and Michael Laurenzano and Mehrzad Samadi and Marios Papaefthymiou and Scott Mahlke and Thomas Wenisch and Jia Deng and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/1808.02513.pdf},
year = {2018},
date = {2018-01-01},
journal = {arXiv preprint arXiv:1808.02513},
abstract = {With ever-increasing computational demand for deep learning, it is critical to investigate the implications of the numeric representation and precision of DNN model weights and activations on computational efficiency. In this work, we explore unconventional narrow-precision floating-point representations as they relate to inference accuracy and efficiency to steer the improved design of future DNN platforms. We show that inference using these custom numeric representations on production-grade DNNs, including GoogLeNet and VGG, achieves an average speedup of 7.6x with less than 1% degradation in inference accuracy relative to a state-of-the-art baseline platform representing the most sophisticated hardware using single-precision floating point. To facilitate the use of such customized precision, we also present a novel technique that drastically reduces the time required to derive the optimal precision configuration.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
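To make the idea of sweeping custom floating-point configurations concrete, here is an illustrative simulation of a narrow-precision float, quantizing a value to a chosen number of mantissa and exponent bits. This is not the paper's implementation; the bit widths and test value are arbitrary.

```python
# Illustrative sketch: quantize a double to a custom (mantissa, exponent)
# bit budget so accuracy can be measured per configuration.
import math

def quantize(x, mant_bits, exp_bits):
    if x == 0.0:
        return 0.0
    sign = math.copysign(1.0, x)
    m, e = math.frexp(abs(x))            # x = m * 2**e, with m in [0.5, 1)
    emax = 2 ** (exp_bits - 1)
    e = max(-emax, min(emax - 1, e))     # clamp exponent to available range
    m = round(m * 2 ** mant_bits) / 2 ** mant_bits   # drop mantissa bits
    return sign * math.ldexp(m, e)

print(quantize(0.1234567, mant_bits=4, exp_bits=5))   # coarse 4-bit mantissa
print(quantize(0.1234567, mant_bits=10, exp_bits=5))  # finer, closer to exact
```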
Lin, Shih-Chieh; Zhang, Yunqi; Hsu, Chang-Hong; Skach, Matt; Haque, Md E; Tang, Lingjia; Mars, Jason
The architectural implications of autonomous driving: Constraints and acceleration Inproceedings
In: Proceedings of the Twenty-Third International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 751–766, 2018.
@inproceedings{lin2018architectural,
title = {The architectural implications of autonomous driving: Constraints and acceleration},
author = {Shih-Chieh Lin and Yunqi Zhang and Chang-Hong Hsu and Matt Skach and Md E Haque and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/AutonomousCar-ASPLOS18.pdf},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the Twenty-Third International Conference on Architectural Support for Programming Languages and Operating Systems},
pages = {751--766},
abstract = {Autonomous driving systems have attracted a significant amount of interest recently, and many industry leaders, such as Google, Uber, Tesla, and Mobileye, have invested a large amount of capital and engineering power on developing such systems. Building autonomous driving systems is particularly challenging due to stringent performance requirements in terms of both making the safe operational decisions and finishing processing at real-time. Despite the recent advancements in technology, such systems are still largely under experimentation and architecting end-to-end autonomous driving systems remains an open research question. To investigate this question, we first present and formalize the design constraints for building an autonomous driving system in terms of performance, predictability, storage, thermal and power. We then build an end-to-end autonomous driving system using state-of-the-art award-winning algorithms to understand the design trade-offs for building such systems. In our real-system characterization, we identify three computational bottlenecks, which conventional multicore CPUs are incapable of processing under the identified design constraints. To meet these constraints, we accelerate these algorithms using three accelerator platforms including GPUs, FPGAs, and ASICs, which can reduce the tail latency of the system by 169x, 10x, and 93x respectively. With accelerator-based designs, we are able to build an end-to-end autonomous driving system that meets all the design constraints, and explore the trade-offs among performance, power and the higher accuracy enabled by higher resolution cameras.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Hsu, Chang-Hong; Deng, Qingyuan; Mars, Jason; Tang, Lingjia
SmoothOperator: Reducing power fragmentation and improving power utilization in large-scale datacenters Inproceedings
In: Proceedings of the Twenty-Third International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 535–548, 2018.
@inproceedings{hsu2018smoothoperator,
title = {SmoothOperator: Reducing power fragmentation and improving power utilization in large-scale datacenters},
author = {Chang-Hong Hsu and Qingyuan Deng and Jason Mars and Lingjia Tang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/smooth_operator.pdf},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the Twenty-Third International Conference on Architectural Support for Programming Languages and Operating Systems},
pages = {535--548},
abstract = {With the ever-growing popularity of cloud computing and web services, Internet companies are in need of increased computing capacity to serve the demand. However, power has become a major limiting factor prohibiting growth in the industry: it is often the case that no more servers can be added to datacenters without surpassing the capacity of the existing power infrastructure.
In this work, we first investigate the power utilization in Facebook datacenters. We observe that the combination of provisioning for peak power usage, highly fluctuating traffic, and multi-level power delivery infrastructure leads to a significant power budget fragmentation problem and inefficiently low power utilization. To address this issue, our insight is that heterogeneity of power consumption patterns among different services provides opportunities to re-shape the power profile of each power node by re-distributing services. By grouping services with asynchronous peak times under the same power node, we can reduce the peak power of each node and thus create more power headroom, allowing more servers to be hosted and achieving higher throughput. Based on this insight, we develop a workload-aware service placement framework to systematically spread the service instances with synchronous power patterns evenly under the power supply tree, greatly reducing the peak power draw at power nodes. We then leverage dynamic power profile reshaping to maximally utilize the headroom unlocked by our placement framework. Our experiments based on real production workload and power traces show that we are able to host up to 13% more machines in production, without changing the underlying power infrastructure. Utilizing the unleashed power headroom with dynamic reshaping, we achieve up to an estimated total of 15% and 11% throughput improvement for latency-critical service and batch service respectively at the same time, with up to 44% of energy slack reduction.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
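The placement insight in this abstract, that grouping services with asynchronous peaks under one power node lowers that node's peak draw, can be shown with a toy calculation. The hourly power traces below are made up; this is not the SmoothOperator framework.

```python
# Toy sketch: a power node's peak draw is the maximum of the summed traces
# placed under it. Pairing anti-correlated traces flattens the peak.
def node_peak(traces):
    return max(sum(vals) for vals in zip(*traces))

web   = [40, 80, 90, 60]   # peaks mid-day
batch = [90, 40, 30, 80]   # peaks off-hours
web2  = [42, 78, 92, 58]

sync_placement  = node_peak([web, web2]), node_peak([batch, batch])
async_placement = node_peak([web, batch]), node_peak([web2, batch])
print("synchronous peaks :", sync_placement)    # -> (182, 180)
print("asynchronous peaks:", async_placement)   # -> (140, 138)
```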
Jain, Animesh; Phanishayee, Amar; Mars, Jason; Tang, Lingjia; Pekhimenko, Gennady
Gist: Efficient data encoding for deep neural network training Inproceedings
In: 2018 ACM/IEEE 45th Annual International Symposium on Computer Architecture (ISCA), pp. 776–789, IEEE 2018.
@inproceedings{jain2018gist,
title = {Gist: Efficient data encoding for deep neural network training},
author = {Animesh Jain and Amar Phanishayee and Jason Mars and Lingjia Tang and Gennady Pekhimenko},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/08416872.pdf},
year = {2018},
date = {2018-01-01},
booktitle = {2018 ACM/IEEE 45th Annual International Symposium on Computer Architecture (ISCA)},
pages = {776--789},
organization = {IEEE},
abstract = {Training modern deep neural networks (DNNs) typically relies on GPUs to train complex hundred-layer deep networks. A significant problem facing both researchers and industry practitioners is that, as the networks get deeper, the available GPU main memory becomes a primary bottleneck, limiting the size of the networks they can train. In this paper, we investigate widely used DNNs and find that the major contributors to memory footprint are intermediate layer outputs (feature maps). We then introduce a framework for DNN-layer-specific optimizations (e.g., convolution, ReLU, pool) that significantly reduce this source of main memory pressure on GPUs. We find that a feature map typically has two uses that are spread far apart temporally. Our key approach is to store an encoded representation of feature maps for this temporal gap and decode this data for use in the backward pass; the full-fidelity feature maps are used in the forward pass and relinquished immediately. Based on this approach, we present Gist, our system that employs two classes of layer-specific encoding schemes - lossless and lossy - to exploit existing value redundancy in DNN training to significantly reduce the memory consumption of targeted feature maps. For example, one insight is by taking advantage of the computational nature of back propagation from pool to ReLU layer, we can store the intermediate feature map using just 1 bit instead of 32 bits per value. We deploy these mechanisms in a state-of-the-art DNN framework (CNTK) and observe that Gist reduces the memory footprint by up to 2× across 5 state-of-the-art image classification DNNs, with an average of 1.8× and only 4% performance overhead. We also show that further software (e.g., CuDNN) and hardware (e.g., dynamic allocation) optimizations can result in even larger footprint reduction (up to 4.1×).},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
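The 1-bit insight named in this abstract, that ReLU's backward pass needs only whether each input was positive, is easy to demonstrate. The PyTorch sketch below is an illustration of that idea, not Gist itself.

```python
# Hedged sketch of the lossless ReLU insight: during the backward pass a
# 1-bit mask can replace the stashed 32-bit feature map, because ReLU's
# gradient only needs the sign of each forward-pass input.
import torch

x = torch.randn(4, 8)
y = torch.relu(x)                  # forward pass uses full fidelity

mask = x > 0                       # keep 1 bit per value for the backward pass
grad_out = torch.randn_like(y)     # gradient arriving from the next layer
grad_in = grad_out * mask          # ReLU backward recomputed from the mask

# check against autograd on a fresh graph: the results match exactly
x2 = x.clone().requires_grad_(True)
torch.relu(x2).backward(grad_out)
assert torch.equal(grad_in, x2.grad)
```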
Kang, Yiping; Zhang, Yunqi; Kummerfeld, Jonathan K; Tang, Lingjia; Mars, Jason
Data collection for dialogue system: A startup perspective Inproceedings
In: Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 3 (Industry Papers), pp. 33–40, 2018.
@inproceedings{kang2018data,
title = {Data collection for dialogue system: A startup perspective},
author = {Yiping Kang and Yunqi Zhang and Jonathan K Kummerfeld and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/N18-3005.pdf},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 3 (Industry Papers)},
pages = {33--40},
abstract = {Industrial dialogue systems such as Apple Siri and Google Now rely on large-scale, diverse, and robust training data to enable their sophisticated conversation capability. Crowdsourcing provides a scalable and inexpensive way of data collection, but collecting high-quality data efficiently requires thoughtful orchestration of the crowdsourcing jobs. Prior studies of this topic have focused only on tasks in academic settings with limited scope, or provide only intrinsic dataset analysis, lacking indication of how the data affects trained model performance. In this paper, we present a study of crowdsourcing methods for a user intent classification task in our deployed dialogue system. Our task requires classification of 47 possible user intents and contains many intent pairs with subtle differences. We consider different crowdsourcing job types and job prompts and analyze quantitatively the quality of the collected data and the downstream model performance on a test set of real user queries from production logs. Our observations provide insights into designing efficient crowdsourcing jobs and offer recommendations for the future dialogue system data collection process.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Kannan, Ram Srivatsa; Jain, Animesh; Laurenzano, Michael A; Tang, Lingjia; Mars, Jason
Proctor: Detecting and investigating interference in shared datacenters Inproceedings
In: 2018 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS), pp. 76–86, IEEE 2018.
@inproceedings{kannan2018proctor,
title = {Proctor: Detecting and investigating interference in shared datacenters},
author = {Ram Srivatsa Kannan and Animesh Jain and Michael A Laurenzano and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/08366937.pdf},
year = {2018},
date = {2018-01-01},
booktitle = {2018 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)},
pages = {76--86},
organization = {IEEE},
abstract = {Cloud-scale datacenter management systems utilize virtualization to provide performance isolation while maximizing the utilization of the underlying hardware infrastructure. However, virtualization does not provide complete performance isolation as Virtual Machines (VMs) still compete for non-reservable shared resources (such as caches, network, and I/O bandwidth). This becomes highly challenging to address in datacenter environments housing tens of thousands of VMs, causing degradation in application performance. Addressing this problem for production datacenters requires a non-intrusive scalable solution that 1) detects performance intrusion and 2) investigates both the intrusive VMs causing interference and the resource(s) for which the VMs are competing. To address this problem, this paper introduces Proctor, a real time, lightweight and scalable analytics fabric that detects performance intrusive VMs and identifies their root causes from among the arbitrary VMs running in shared datacenters across 4 key hardware resources - network, I/O, cache, and CPU. Proctor is based on a robust statistical approach that requires no special profiling phases, standing in stark contrast to a wide body of prior work that assumes pre-acquisition of application level information prior to its execution. By detecting performance degradation and identifying the root cause VMs and their metrics, Proctor can be utilized to dramatically improve the performance outcomes of applications executing in large-scale datacenters. From our experiments, we are able to show that when we deploy Proctor in a datacenter housing a mix of I/O, network, compute and cache-sensitive applications, it is able to effectively pinpoint performance intrusive VMs. Further, we observe that when Proctor is applied with migration, the application-level Quality-of-Service improves by an average of 2.2× as compared to systems which are unable to detect, identify and pinpoint performance intrusion and their root causes.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Skach, Matt; Arora, Manish; Tullsen, Dean; Tang, Lingjia; Mars, Jason
Virtual melting temperature: managing server load to minimize cooling overhead with phase change materials Inproceedings
In: 2018 ACM/IEEE 45th Annual International Symposium on Computer Architecture (ISCA), pp. 15–28, IEEE 2018.
@inproceedings{skach2018virtual,
title = {Virtual melting temperature: managing server load to minimize cooling overhead with phase change materials},
author = {Matt Skach and Manish Arora and Dean Tullsen and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/08416815.pdf},
year = {2018},
date = {2018-01-01},
booktitle = {2018 ACM/IEEE 45th Annual International Symposium on Computer Architecture (ISCA)},
pages = {15--28},
organization = {IEEE},
abstract = {As the power density and power consumption of large scale datacenters continue to grow, the challenge of removing heat from these datacenters and keeping them cool is increasingly urgent and costly. With the largest datacenters now exceeding 200 MW of power, the cooling systems that prevent overheating cost on the order of tens of millions of dollars. Prior work proposed to deploy phase change materials (PCM) and use Thermal Time Shifting (TTS) to reshape the thermal load of a datacenter by storing heat during peak hours of high utilization and releasing it during off hours when utilization is low, enabling a smaller cooling system to handle the same peak load. The peak cooling load reduction enabled by TTS is greatly beneficial; however, TTS is a passive system that cannot handle many workload mixtures or adapt to changing load or environmental characteristics. In this work we propose VMT, a thermal aware job placement technique that adds an active, tunable component to enable greater control over datacenter thermal output. We propose two different job placement algorithms for VMT and perform a scale out study of VMT in a simulated server cluster. We provide analysis of the use cases and trade-offs of each algorithm, and show that VMT reduces peak cooling load by up to 12.8% to provide over two million dollars in cost savings when a smaller cooling system is installed, or allows for over 7,000 additional servers to be added in scenarios where TTS is ineffective.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Lin, Shih-Chieh; Hsu, Chang-Hong; Talamonti, Walter; Zhang, Yunqi; Oney, Steve; Mars, Jason; Tang, Lingjia
Adasa: A Conversational In-Vehicle Digital Assistant for Advanced Driver Assistance Features Inproceedings
In: Proceedings of the 31st Annual ACM Symposium on User Interface Software and Technology, pp. 531–542, 2018.
@inproceedings{lin2018adasa,
title = {Adasa: A Conversational In-Vehicle Digital Assistant for Advanced Driver Assistance Features},
author = {Shih-Chieh Lin and Chang-Hong Hsu and Walter Talamonti and Yunqi Zhang and Steve Oney and Jason Mars and Lingjia Tang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/lin2018adasa.pdf},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the 31st Annual ACM Symposium on User Interface Software and Technology},
pages = {531--542},
abstract = {Advanced Driver Assistance Systems (ADAS) come equipped on most modern vehicles and are intended to assist the driver and enhance the driving experience through features such as lane keeping system and adaptive cruise control. However, recent studies show that few people utilize these features for several reasons. First, ADAS features were not common until recently. Second, most users are unfamiliar with these features and do not know what to expect. Finally, the interface for operating these features is not intuitive. To help drivers understand ADAS features, we present a conversational in-vehicle digital assistant that responds to drivers' questions and commands in natural language. With the system prototyped herein, drivers can ask questions or command using unconstrained natural language in the vehicle, and the assistant trained by using advanced machine learning techniques, coupled with access to vehicle signals, responds in real-time based on conversational context. Results of our system prototyped on a production vehicle are presented, demonstrating its effectiveness in improving driver understanding and usability of ADAS.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jain, Animesh; Laurenzano, Michael A; Pokam, Gilles A; Mars, Jason; Tang, Lingjia
Architectural support for convolutional neural networks on modern CPUs Inproceedings
In: Proceedings of the 27th International Conference on Parallel Architectures and Compilation Techniques, pp. 1–13, 2018.
@inproceedings{jain2018architectural,
title = {Architectural support for convolutional neural networks on modern CPUs},
author = {Animesh Jain and Michael A Laurenzano and Gilles A Pokam and Jason Mars and Lingjia Tang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3243176.3243177.pdf},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the 27th International Conference on Parallel Architectures and Compilation Techniques},
pages = {1--13},
abstract = {A key focus of recent work in our community has been on devising increasingly sophisticated acceleration devices for deep neural network (DNN) computation, especially for networks driven by convolution layers. Yet, despite the promise of substantial improvements in performance and energy consumption offered by these approaches, general purpose computing is not going away because of its traditional, well-understood programming model and continued wide deployment. Therefore, the question arises as to what can be done, if anything, to evolve conventional CPUs to accommodate efficient deep neural network computation.
This work focuses on the challenging problem of identifying and alleviating the performance bottlenecks for convolution layer computation for conventional CPU platforms. We begin by performing a detailed study of a range of CNN-based applications on a modern CPU microarchitecture, finding that designing a physical register file (PRF) capable of feeding computational units is the primary barrier that prevents the addition of more compute units in the CPU, limiting the performance improvements that can be achieved by the CPU on convolution layers. We present the design of a novel, minimally intrusive set of microarchitectural and ISA extensions that address this problem and describe the code generation support needed to take advantage of our design. Through a detailed evaluation that covers 5 state-of-the-art neural network applications, we observe that applying these extensions allows packing more compute in the CPU while keeping PRF energy in check, achieving a 2× performance improvement and a 2.7× energy-delay product improvement against a popular Intel Haswell server processor baseline.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2017
Hsu, Chang-Hong; Zhang, Yunqi; Laurenzano, Michael A; Meisner, David; Wenisch, Thomas; Dreslinski, Ronald G; Mars, Jason; Tang, Lingjia
Reining in long tails in warehouse-scale computers with quick voltage boosting using Adrenaline Journal Article
In: ACM Transactions on Computer Systems (TOCS), vol. 35, no. 1, pp. 1–33, 2017.
@article{hsu2017reining,
title = {Reining in long tails in warehouse-scale computers with quick voltage boosting using Adrenaline},
author = {Chang-Hong Hsu and Yunqi Zhang and Michael A Laurenzano and David Meisner and Thomas Wenisch and Ronald G Dreslinski and Jason Mars and Lingjia Tang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3054742.pdf},
year = {2017},
date = {2017-01-01},
journal = {ACM Transactions on Computer Systems (TOCS)},
volume = {35},
number = {1},
pages = {1--33},
publisher = {ACM New York, NY, USA},
abstract = {Reducing the long tail of the query latency distribution in modern warehouse scale computers is critical for improving performance and quality of service (QoS) of workloads such as Web Search and Memcached. Traditional turbo boost increases a processor’s voltage and frequency during a coarse-grained sliding window, boosting all queries that are processed during that window. However, the inability of such a technique to pinpoint tail queries for boosting limits its tail reduction benefit. In this work, we propose Adrenaline, an approach to leverage finer-granularity (tens of nanoseconds) voltage boosting to effectively rein in the tail latency with query-level precision. Two key insights underlie this work. First, emerging finer granularity voltage/frequency boosting is an enabling mechanism for intelligent allocation of the power budget to precisely boost only the queries that contribute to the tail latency; second, per-query characteristics can be used to design indicators for proactively pinpointing these queries, triggering boosting accordingly. Based on these insights, Adrenaline effectively pinpoints and boosts queries that are likely to increase the tail distribution and can reap more benefit from the voltage/frequency boost. By evaluating under various workload configurations, we demonstrate the effectiveness of our methodology. We achieve up to a 2.50× tail latency improvement for Memcached and up to 3.03× for Web Search over coarse-grained dynamic voltage and frequency scaling (DVFS) given a fixed boosting power budget. When optimizing for energy reduction, Adrenaline achieves up to a 1.81× improvement for Memcached and up to 1.99× for Web Search over coarse-grained DVFS. By using the carefully chosen boost thresholds, Adrenaline further improves the tail latency reduction to 4.82× over coarse-grained DVFS.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
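The per-query policy this abstract describes, using a cheap indicator to boost only likely-tail queries, can be sketched in a few lines. The indicator, thresholds, and frequencies below are invented for illustration; this is not the Adrenaline mechanism itself.

```python
# Illustrative sketch of query-level boosting: spend the fixed power
# budget only on queries predicted to land in the tail.
def choose_frequency(predicted_service_ms, tail_threshold_ms=8.0,
                     base_ghz=2.0, boost_ghz=3.0):
    # boosting only predicted-tail queries shortens the 99th percentile
    # without paying the boost power cost on every query
    return boost_ghz if predicted_service_ms > tail_threshold_ms else base_ghz

for ms in (1.2, 6.5, 11.0):
    print(f"{ms} ms predicted -> {choose_frequency(ms)} GHz")
```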
Kang, Yiping; Hauswald, Johann; Gao, Cao; Rovinski, Austin; Mudge, Trevor; Mars, Jason; Tang, Lingjia
Neurosurgeon: Collaborative intelligence between the cloud and mobile edge Journal Article
In: ACM SIGARCH Computer Architecture News, vol. 45, no. 1, pp. 615–629, 2017.
@article{kang2017neurosurgeon,
title = {Neurosurgeon: Collaborative intelligence between the cloud and mobile edge},
author = {Yiping Kang and Johann Hauswald and Cao Gao and Austin Rovinski and Trevor Mudge and Jason Mars and Lingjia Tang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3037697.3037698.pdf},
year = {2017},
date = {2017-01-01},
journal = {ACM SIGARCH Computer Architecture News},
volume = {45},
number = {1},
pages = {615--629},
publisher = {ACM New York, NY, USA},
abstract = {The computation for today's intelligent personal assistants such as Apple Siri, Google Now, and Microsoft Cortana is performed in the cloud. This cloud-only approach requires significant amounts of data to be sent to the cloud over the wireless network and puts significant computational pressure on the datacenter. However, as the computational resources in mobile devices become more powerful and energy efficient, questions arise as to whether this cloud-only processing is desirable moving forward, and what the implications are of pushing some or all of this compute to the mobile devices on the edge.
In this paper, we examine the status quo approach of cloud-only processing and investigate computation partitioning strategies that effectively leverage both the cycles in the cloud and on the mobile device to achieve low latency, low energy consumption, and high datacenter throughput for this class of intelligent applications. Our study uses 8 intelligent applications spanning computer vision, speech, and natural language domains, all employing state-of-the-art Deep Neural Networks (DNNs) as the core machine learning technique. We find that given the characteristics of DNN algorithms, a fine-grained, layer-level computation partitioning strategy based on the data and computation variations of each layer within a DNN has significant latency and energy advantages over the status quo approach.
Using this insight, we design Neurosurgeon, a lightweight scheduler to automatically partition DNN computation between mobile devices and datacenters at the granularity of neural network layers. Neurosurgeon does not require per-application profiling. It adapts to various DNN architectures, hardware platforms, wireless networks, and server load levels, intelligently partitioning computation for best latency or best mobile energy. We evaluate Neurosurgeon on a state-of-the-art mobile development platform and show that it improves end-to-end latency by 3.1X on average and up to 40.7X, reduces mobile energy consumption by 59.5% on average and up to 94.7%, and improves datacenter throughput by 1.5X on average and up to 6.7X.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
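The layer-level partitioning this abstract describes reduces to a small search: run the first k layers on the device, upload that layer's output, and finish in the cloud, picking the split with the lowest end-to-end latency. The sketch below illustrates that search; all latencies, sizes, and the uplink rate are invented, and returning the final result downstream is assumed to be free.

```python
# Hedged sketch of layer-granularity partitioning between mobile and cloud.
def best_split(mobile_ms, cloud_ms, out_kb, uplink_kbps):
    """out_kb[k]: KB uploaded when the first k layers run locally
    (out_kb[0] is the raw input); k == n means fully on-device."""
    n = len(mobile_ms)
    best = None
    for k in range(n + 1):            # k = number of layers run on the phone
        transfer = (out_kb[k] * 8) / uplink_kbps * 1000 if k < n else 0.0
        total = sum(mobile_ms[:k]) + transfer + sum(cloud_ms[k:])
        if best is None or total < best[1]:
            best = (k, total)
    return best

mobile_ms = [12, 30, 45, 60]      # per-layer latency on the phone (ms)
cloud_ms  = [2, 5, 8, 3]          # per-layer latency in the datacenter (ms)
out_kb    = [600, 150, 40, 10]    # data shipped at each candidate split
print(best_split(mobile_ms, cloud_ms, out_kb, uplink_kbps=4000))  # -> (3, 110.0)
```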
Chen, Quan; Yang, Hailong; Guo, Minyi; Kannan, Ram Srivatsa; Mars, Jason; Tang, Lingjia
Prophet: Precise QoS prediction on non-preemptive accelerators to improve utilization in warehouse-scale computers Inproceedings
In: Proceedings of the Twenty-Second International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 17–32, 2017.
@inproceedings{chen2017prophet,
title = {Prophet: Precise QoS prediction on non-preemptive accelerators to improve utilization in warehouse-scale computers},
author = {Quan Chen and Hailong Yang and Minyi Guo and Ram Srivatsa Kannan and Jason Mars and Lingjia Tang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3093336.3037700.pdf},
year = {2017},
date = {2017-01-01},
booktitle = {Proceedings of the Twenty-Second International Conference on Architectural Support for Programming Languages and Operating Systems},
pages = {17--32},
abstract = {Guaranteeing Quality-of-Service (QoS) of latency-sensitive applications while improving server utilization through application co-location is important yet challenging in modern datacenters. The key challenge is that when applications are co-located on a server, performance interference due to resource contention can be detrimental to the application QoS. Although prior work has proposed techniques to identify "safe" co-locations where application QoS is satisfied by predicting the performance interference on multicores, no such prediction technique exists for accelerators such as GPUs.
In this work, we present Prophet, an approach to precisely predict the performance degradation of latency-sensitive applications on accelerators due to application co-location. We analyzed the performance interference on accelerators through a real system investigation and found that unlike on multicores where the key contentious resources are shared caches and main memory bandwidth, the key contentious resources on accelerators are instead processing elements, accelerator memory bandwidth and PCIe bandwidth. Based on this observation, we designed interference models that enable the precise prediction for processing element, accelerator memory bandwidth and PCIe bandwidth contention on real hardware. By using a novel technique to forecast solo-run execution traces of the co-located applications using interference models, Prophet can accurately predict the performance degradation of latency-sensitive applications on non-preemptive accelerators. Using Prophet, we can identify "safe" co-locations on accelerators to improve utilization without violating the QoS target. Our evaluation shows that Prophet can predict the performance degradation with an average prediction error 5.47% on real systems. Meanwhile, based on the prediction, Prophet achieves accelerator utilization improvements of 49.9% on average while maintaining the QoS target of latency-sensitive applications.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
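Once a predictor like the one this abstract describes exists, admitting a co-location is a simple check: accept only candidates whose predicted slowdown keeps the latency-sensitive application within its QoS target. The numbers and candidate names below are invented; the prediction itself is Prophet's job and is not reproduced here.

```python
# Toy sketch of "safe" co-location admission from predicted degradation.
def safe_colocations(ls_solo_ms, qos_ms, predicted_slowdown):
    """predicted_slowdown: candidate -> multiplicative slowdown factor."""
    return [cand for cand, s in predicted_slowdown.items()
            if ls_solo_ms * s <= qos_ms]

print(safe_colocations(ls_solo_ms=40.0, qos_ms=50.0,
                       predicted_slowdown={"transcode": 1.10,
                                           "training": 1.45,
                                           "compress": 1.22}))
# -> ['transcode', 'compress']  (44 and 48.8 ms meet the 50 ms target;
#    'training' at 58 ms would violate QoS)
```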
Yang, Hailong; Chen, Quan; Riaz, Moeiz; Luan, Zhongzhi; Tang, Lingjia; Mars, Jason
PowerChief: Intelligent power allocation for multi-stage applications to improve responsiveness on power constrained CMP Inproceedings
In: Proceedings of the 44th Annual International Symposium on Computer Architecture, pp. 133–146, 2017.
@inproceedings{yang2017powerchief,
title = {PowerChief: Intelligent power allocation for multi-stage applications to improve responsiveness on power constrained CMP},
author = {Hailong Yang and Quan Chen and Moeiz Riaz and Zhongzhi Luan and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/04/3079856.3080224.pdf},
year = {2017},
date = {2017-01-01},
booktitle = {Proceedings of the 44th Annual International Symposium on Computer Architecture},
pages = {133--146},
abstract = {Modern user-facing applications consist of multiple processing stages with a number of service instances in each stage. The latency profile of these multi-stage applications is intrinsically variable, making it challenging to provide satisfactory responsiveness. Given a limited power budget, improving the end-to-end latency requires intelligently boosting the bottleneck service across stages using multiple boosting techniques. However, prior work fails to acknowledge the multi-stage nature of user-facing applications and performs poorly in improving responsiveness on power-constrained CMPs, as it is unable to accurately identify the bottleneck service and apply the boosting techniques adaptively.
In this paper, we present PowerChief, a runtime framework that 1) provides joint design of service and query to monitor the latency statistics across service stages and accurately identifies the bottleneck service during runtime; 2) adaptively chooses the boosting technique to accelerate the bottleneck service with improved responsiveness; 3) dynamically reallocates the constrained power budget across service stages to accommodate the chosen boosting technique. Evaluated with real world multi-stage applications, PowerChief improves the average latency by 20.3x and 32.4x (99% tail latency by 13.3x and 19.4x) for Sirius and Natural Language Processing applications respectively compared to stage-agnostic power allocation. In addition, for the given QoS target, PowerChief reduces the power consumption of Sirius and Web Search applications by 23% and 33% respectively over prior work.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}