2015
Muneeb Khan; Michael A Laurenzano; Jason Mars; Erik Hagersten; David Black-Schaffer
AREP: Adaptive resource efficient prefetching for maximizing multicore performance Proceedings Article
In: 2015 International Conference on Parallel Architecture and Compilation (PACT), pp. 367–378, IEEE 2015.
@inproceedings{khan2015arep,
title = {AREP: Adaptive resource efficient prefetching for maximizing multicore performance},
author = {Muneeb Khan and Michael A Laurenzano and Jason Mars and Erik Hagersten and David Black-Schaffer},
url = {https://www.jasonmars.org/wp-content/uploads/2020/05/07429320.pdf},
year = {2015},
date = {2015-01-01},
booktitle = {2015 International Conference on Parallel Architecture and Compilation (PACT)},
pages = {367--378},
organization = {IEEE},
abstract = {Modern processors widely use hardware prefetching to hide memory latency. While aggressive hardware prefetchers can improve performance significantly for some applications, they can limit the overall performance in highly-utilized multicore processors by saturating the offchip bandwidth and wasting last-level cache capacity. Co-executing applications can slow down due to contention over these shared resources. This work introduces Adaptive Resource Efficient Prefetching (AREP) -- a runtime framework that dynamically combines software prefetching and hardware prefetching to maximize throughput in highly utilized multicore processors. AREP achieves better performance by prefetching data in a resource efficient way -- conserving offchip bandwidth and last-level cache capacity through accurate prefetching and by applying cache-bypassing when possible. AREP dynamically explores a mix of hardware/software prefetching policies, then selects and applies the best performing policy. AREP is phase-aware and re-explores (at runtime) for the best prefetching policy at phase boundaries. A multitude of experiments with workload mixes and parallel applications on a modern high performance multicore show that AREP can increase throughput by up to 49% (8.1% on average). This is complemented by improved fairness, resulting in average quality of service above 94%.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
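The explore-then-exploit loop the abstract describes can be sketched compactly. The following is a hypothetical Python sketch, not AREP's implementation: the policy names, apply_policy, and measure_throughput are invented stand-ins for what would really be prefetcher control-register writes, software-prefetch code variants, and performance-counter reads.

```python
# Hypothetical sketch of AREP-style policy selection. All hooks are stand-ins.
import random
import time

POLICIES = ["hw-aggressive", "hw-conservative", "sw-prefetch", "sw-prefetch+bypass"]

def apply_policy(policy):
    """Stand-in: toggle hardware prefetchers / switch software-prefetch variant."""

def measure_throughput(interval_s=0.01):
    """Stand-in: read instructions retired over a short interval."""
    time.sleep(interval_s)
    return random.uniform(0.8, 1.2)   # placeholder metric

def explore_and_select():
    best, best_ipc = None, float("-inf")
    for policy in POLICIES:           # brief trial of each candidate policy
        apply_policy(policy)
        ipc = measure_throughput()
        if ipc > best_ipc:
            best, best_ipc = policy, ipc
    apply_policy(best)                # exploit the winner; a detected phase
    return best                       # boundary would trigger re-exploration
```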
2014
Yan Zhai; Xiao Zhang; Stephane Eranian; Lingjia Tang; Jason Mars
HaPPy: Hyperthread-aware power profiling dynamically Proceedings Article
In: 2014 USENIX Annual Technical Conference (USENIX ATC 2014), pp. 211–217, 2014.
@inproceedings{zhai2014happy,
title = {HaPPy: Hyperthread-aware power profiling dynamically},
author = {Yan Zhai and Xiao Zhang and Stephane Eranian and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/05/atc14-paper-zhai.pdf},
year = {2014},
date = {2014-01-01},
booktitle = {2014 USENIX Annual Technical Conference (USENIX ATC 2014)},
pages = {211--217},
abstract = {Quantifying the power consumption of individual applications co-running on a single server is a critical component for software-based power capping, scheduling, and provisioning techniques in modern datacenters. However, with the proliferation of hyperthreading in the last few generations of server-grade processor designs, the challenge of accurately and dynamically performing this power attribution to individual threads has been significantly exacerbated. Due to the sharing of core-level resources such as functional units, prior techniques are not suitable to attribute the power consumption between hyperthreads sharing a physical core.
In this paper, we present a runtime mechanism that quantifies and attributes power consumption to individual jobs at fine granularity. Specifically, we introduce a hyperthread-aware power model that differentiates between the states when both hardware threads of a core are in use, and when only one thread is in use. By capturing these two different states, we are able to accurately attribute power to each logical CPU in modern servers. We conducted experiments with several Google production workloads on an Intel Sandy Bridge server. Compared to the prior hyperthread-oblivious model, HaPPy is substantially more accurate, reducing the prediction error from 20.5% to 7.5% on average and from 31.5% to 9.4% in the worst case.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
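The core of the model is a two-state attribution rule. Below is a minimal sketch in that spirit; the coefficients are illustrative placeholders, not the paper's fitted values.

```python
# Minimal sketch of a hyperthread-aware power model in the spirit of HaPPy.
# p_single and p_dual are illustrative stand-ins for fitted coefficients.

def attribute_power(cycles_a_alone, cycles_b_alone, cycles_both,
                    p_single=4.0, p_dual=5.5):
    """Split core dynamic energy between two hyperthreads.

    cycles_*_alone: cycles where only that logical CPU was active;
    cycles_both:    cycles where both were active (cost split evenly).
    """
    energy_a = p_single * cycles_a_alone + (p_dual / 2) * cycles_both
    energy_b = p_single * cycles_b_alone + (p_dual / 2) * cycles_both
    return energy_a, energy_b

# A hyperthread-oblivious model charges p_single for every active cycle,
# over-predicting whenever both threads share the core:
print(attribute_power(1e9, 2e9, 5e9))
```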
Yunqi Zhang; Michael A Laurenzano; Jason Mars; Lingjia Tang
SMiTe: Precise QoS prediction on real-system SMT processors to improve utilization in warehouse scale computers Proceedings Article
In: 2014 47th Annual IEEE/ACM International Symposium on Microarchitecture, pp. 406–418, IEEE 2014.
@inproceedings{zhang2014smite,
title = {SMiTe: Precise QoS prediction on real-system SMT processors to improve utilization in warehouse scale computers},
author = {Yunqi Zhang and Michael A Laurenzano and Jason Mars and Lingjia Tang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/05/07011405.pdf},
year = {2014},
date = {2014-01-01},
booktitle = {2014 47th Annual IEEE/ACM International Symposium on Microarchitecture},
pages = {406--418},
organization = {IEEE},
abstract = {One of the key challenges for improving efficiency in warehouse scale computers (WSCs) is to improve server utilization while guaranteeing the quality of service (QoS) of latency-sensitive applications. To this end, prior work has proposed techniques to precisely predict performance and QoS interference to identify 'safe' application co-locations. However, such techniques are only applicable to resources shared across cores. Achieving such precise interference prediction on real-system simultaneous multithreading (SMT) architectures has been a significantly challenging open problem due to the complexity introduced by sharing resources within a core. In this paper, we demonstrate through a real-system investigation that the fundamental difference between resource sharing behaviors on CMP and SMT architectures calls for a redesign of the way we model interference. For SMT servers, the interference effects on different shared resources, including private caches, memory ports, and integer and floating-point functional units, do not correlate with one another. This insight suggests the necessity of decoupling interference into multiple resource sharing dimensions. In this work, we propose SMiTe, a methodology that enables precise performance prediction for SMT co-location on real-system commodity processors. With a set of Rulers, which are carefully designed software stressors that apply pressure to a multidimensional space of shared resources, we quantify application sensitivity and contentiousness in a decoupled manner. We then establish a regression model to combine the sensitivity and contentiousness in different dimensions to predict performance interference. Using this methodology, we are able to precisely predict the performance interference in SMT co-location with an average error of 2.80% on SPEC CPU2006 and 1.79% on CloudSuite. Our evaluation shows that SMiTe allows us to improve the utilization of WSCs by up to 42.57% while enforcing an application's QoS requirements.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
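The decoupled prediction reduces to a per-dimension product of victim sensitivity and co-runner contentiousness, combined by a linear model. A sketch follows; the dimension names mirror the abstract, while the weights and scores are invented placeholders for values the Rulers and regression would produce.

```python
# Illustrative sketch of SMiTe-style decoupled interference prediction.
DIMENSIONS = ["l1d", "l2", "mem_ports", "int_units", "fp_units"]

def predict_degradation(sensitivity, contentiousness, weights, bias=0.0):
    """Predict % slowdown of a victim co-located with a co-runner on an SMT core."""
    return bias + sum(
        weights[d] * sensitivity[d] * contentiousness[d] for d in DIMENSIONS
    )

# Weights would be fit by regression against measured co-locations; these are toys.
w = {d: 1.0 for d in DIMENSIONS}
s = {"l1d": 0.4, "l2": 0.2, "mem_ports": 0.1, "int_units": 0.3, "fp_units": 0.0}
c = {"l1d": 0.5, "l2": 0.1, "mem_ports": 0.2, "int_units": 0.6, "fp_units": 0.0}
print(predict_degradation(s, c, w))
```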
Michael A Laurenzano; Yunqi Zhang; Lingjia Tang; Jason Mars
Protean code: Achieving near-free online code transformations for warehouse scale computers Proceedings Article
In: Proceedings of the 47th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO), 2014.
@inproceedings{laurenzano2014protean,
title = {Protean code: Achieving near-free online code transformations for warehouse scale computers},
author = {Michael A Laurenzano and Yunqi Zhang and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/05/MICRO.2014.21.pdf},
year = {2014},
date = {2014-01-01},
booktitle = {Proceedings of the 47th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
abstract = {Rampant dynamism due to load fluctuations, co-runner changes, and varying levels of interference poses a threat to application quality of service (QoS) and has limited our ability to allow co-locations in modern warehouse scale computers (WSCs). Instruction set features such as the non-temporal memory access hints found in modern ISAs (both ARM and x86) may be useful in mitigating these effects. However, despite the challenge of this dynamism and the availability of an instruction set mechanism that might help address the problem, a key capability missing in the system software stack in modern WSCs is the ability to dynamically transform (and re-transform) the executing application code to apply these instruction set features when necessary.
In this work we introduce protean code, a novel approach for enacting arbitrary compiler transformations at runtime for native programs running on commodity hardware with negligible (<1%) overhead. The fundamental insight behind the underlying mechanism of protean code is that, instead of maintaining full control throughout the program's execution as with traditional dynamic optimizers, protean code allows the original binary to execute continuously and diverts control flow only at a set of virtualized points, allowing rapid and seamless rerouting to the new code variants. In addition, the protean code compiler embeds IR with high-level semantic information into the program, empowering the dynamic compiler to perform rich analysis and transformations online with little overhead. Using a fully functional protean code compiler and runtime built on LLVM, we design PC3D, Protean Code for Cache Contention in Datacenters. PC3D dynamically employs non-temporal access hints to achieve utilization improvements of up to 2.8x (1.5x on average) higher than state-of-the-art contention mitigation runtime techniques at a QoS target of 98%.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
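The virtualized-point mechanism can be illustrated with a toy indirection table: the original code keeps executing, and installing a new variant is a single pointer swap at a virtualized site. In the real system these are patched indirect jumps in the native binary; every name below is illustrative only.

```python
# Conceptual sketch of protean code's virtualized control-flow points,
# modeled as an indirection table. Python names are stand-ins.
import threading

dispatch = {}                            # virtualized point -> current variant
_lock = threading.Lock()

def hot_loop_v0(data):
    return sum(data)                     # original code keeps running...

def hot_loop_v1(data):
    # ...until the dynamic compiler installs a new variant (in the native
    # setting, e.g. one that uses non-temporal memory access hints).
    return sum(data)

dispatch["hot_loop"] = hot_loop_v0

def call(point, *args):
    return dispatch[point](*args)        # control diverts only at this point

def install_variant(point, fn):
    with _lock:                          # rerouting is a single pointer swap
        dispatch[point] = fn

install_variant("hot_loop", hot_loop_v1)
print(call("hot_loop", [1, 2, 3]))
```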
Alex D Breslow; Ananta Tiwari; Martin Schulz; Laura Carrington; Lingjia Tang; Jason Mars
Enabling fair pricing on high performance computer systems with node sharing Journal Article
In: Scientific Programming, vol. 22, no. 2, pp. 59–74, 2014.
@article{breslow2014enabling,
title = {Enabling fair pricing on high performance computer systems with node sharing},
author = {Alex D Breslow and Ananta Tiwari and Martin Schulz and Laura Carrington and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/05/2503210.2503256.pdf},
year = {2014},
date = {2014-01-01},
journal = {Scientific Programming},
volume = {22},
number = {2},
pages = {59--74},
publisher = {IOS Press},
abstract = {Co-location, where multiple jobs share compute nodes in large-scale HPC systems, has been shown to increase aggregate throughput and energy efficiency by 10--20%. However, system operators disallow co-location due to fair-pricing concerns, i.e., a pricing mechanism that considers performance interference from co-running jobs. In the current pricing model, application execution time determines the price, which results in unfair prices paid by the minority of users whose jobs suffer from co-location. This paper presents POPPA, a runtime system that enables fair pricing by delivering precise online interference detection and facilitates the adoption of supercomputers with co-locations. POPPA leverages a novel shutter mechanism -- a cyclic, fine-grained interference sampling mechanism to accurately deduce the interference between co-runners -- to provide unbiased pricing of jobs that share nodes. POPPA is able to quantify inter-application interference within 4% mean absolute error on a variety of co-located benchmark and real scientific workloads.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
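Once the shutter has produced solo and co-run progress rates, the pricing arithmetic is simple. A hypothetical sketch (the function names and rates are invented; POPPA's actual sampling and billing machinery is more involved):

```python
# Hypothetical sketch of shutter-based fair pricing.
def estimate_interference(rate_corun, rate_solo):
    """Fractional slowdown attributed to co-runners."""
    return max(0.0, 1.0 - rate_corun / rate_solo)

def fair_price(base_rate, hours_billed, interference):
    """Charge for the time the job would have needed running alone."""
    return base_rate * hours_billed * (1.0 - interference)

# e.g. shutter samples: 1.8 IPC co-run vs 2.0 IPC solo -> 10% discount
print(fair_price(0.05, 100, estimate_interference(1.8, 2.0)))
```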
2013
Lingjia Tang; Jason Mars; Wei Wang; Tanima Dey; Mary Lou Soffa
ReQoS: Reactive static/dynamic compilation for QoS in warehouse scale computers Journal Article
In: ACM SIGPLAN Notices, vol. 48, no. 4, pp. 89–100, 2013.
@article{tang2013reqos,
title = {ReQoS: Reactive static/dynamic compilation for QoS in warehouse scale computers},
author = {Lingjia Tang and Jason Mars and Wei Wang and Tanima Dey and Mary Lou Soffa},
url = {https://www.jasonmars.org/wp-content/uploads/2020/05/2490301.2451126.pdf},
year = {2013},
date = {2013-01-01},
journal = {ACM SIGPLAN Notices},
volume = {48},
number = {4},
pages = {89--100},
publisher = {ACM New York, NY, USA},
abstract = {As multicore processors with expanding core counts continue to dominate the server market, the overall utilization of the class of datacenters known as warehouse scale computers (WSCs) depends heavily on colocation of multiple workloads on each server to take advantage of the computational power provided by modern processors. However, many of the applications running in WSCs, such as websearch, are user-facing and have quality of service (QoS) requirements. When multiple applications are co-located on a multicore machine, contention for shared memory resources threatens application QoS as severe cross-core performance interference may occur. WSC operators are left with two options: either disregard QoS to maximize WSC utilization, or disallow the co-location of high-priority user-facing applications with other applications, resulting in low machine utilization and millions of dollars wasted.
This paper presents ReQoS, a static/dynamic compilation approach that enables low-priority applications to adaptively manipulate their own contentiousness to ensure the QoS of high-priority co-runners. ReQoS is composed of a profile guided compilation technique that identifies and inserts markers in contentious code regions in low-priority applications, and a lightweight runtime that monitors the QoS of high-priority applications and reactively reduces the pressure low-priority applications generate to the memory subsystem when cross-core interference is detected. In this work, we show that ReQoS can accurately diagnose contention and significantly reduce performance interference to ensure application QoS. Applying ReQoS to SPEC2006 and SmashBench workloads on real multicore machines, we are able to improve machine utilization by more than 70% in many cases, and more than 50% on average, while enforcing a 90% QoS threshold. We are also able to improve the energy efficiency of modern multicore machines by 47% on average over a policy of disallowing co-locations.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
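A rough sketch of the reactive loop follows (all hook names invented): the low-priority application executes compiler-inserted markers in its contentious regions, and the runtime adjusts the nap taken at each marker based on the high-priority co-runner's measured QoS.

```python
# Rough sketch of a ReQoS-style reactive throttle. Hooks are stand-ins.
import time

QOS_TARGET = 0.90            # fraction of solo performance to preserve

class Throttle:
    def __init__(self):
        self.delay_s = 0.0   # nap inserted at each contentious-region marker

    def at_marker(self):
        if self.delay_s:
            time.sleep(self.delay_s)   # pessimize the low-priority app

throttle = Throttle()

def control_step(measured_qos, step=1e-4):
    """Back off when the high-priority job's QoS is violated; relax otherwise."""
    if measured_qos < QOS_TARGET:
        throttle.delay_s += step
    else:
        throttle.delay_s = max(0.0, throttle.delay_s - step)
```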
Lingjia Tang; Jason Mars; Xiao Zhang; Robert Hagmann; Robert Hundt; Eric Tune
Optimizing Google's warehouse scale computers: The NUMA experience Proceedings Article
In: 2013 IEEE 19th International Symposium on High Performance Computer Architecture (HPCA), pp. 188–197, IEEE 2013.
@inproceedings{tang2013optimizing,
title = {Optimizing Google's warehouse scale computers: The NUMA experience},
author = {Lingjia Tang and Jason Mars and Xiao Zhang and Robert Hagmann and Robert Hundt and Eric Tune},
url = {https://www.jasonmars.org/wp-content/uploads/2020/05/06522318.pdf},
year = {2013},
date = {2013-01-01},
booktitle = {2013 IEEE 19th International Symposium on High Performance Computer Architecture (HPCA)},
pages = {188--197},
organization = {IEEE},
abstract = {Due to the complexity and the massive scale of modern warehouse scale computers (WSCs), it is challenging to quantify the performance impact of individual microarchitectural properties and the potential optimization benefits in the production environment. As a result of these challenges, there is currently a lack of understanding of the microarchitecture-workload interaction, leaving potentially significant performance on the table. This paper argues for a two-phase performance analysis methodology for optimizing WSCs that combines both an in-production investigation and an experimental load-testing study. To demonstrate the effectiveness of this two-phase approach, and to illustrate the challenges, methodologies and opportunities in optimizing modern WSCs, this paper investigates the impact of non-uniform memory access (NUMA) for several Google's key web-service workloads in large-scale production WSCs. Leveraging a newly-designed metric and continuous large-scale profiling in live datacenters, our production analysis demonstrates that NUMA has a significant impact (10-20%) on two important web-services: Gmail backend and web-search frontend. Our carefully designed load-test further reveals surprising tradeoffs between optimizing for NUMA performance and reducing cache contention.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
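The production analysis rests on a NUMA metric computed from continuous profiling. The paper defines its own newly-designed metric; purely to convey the flavor, here is a toy locality score (the counter sources are stand-ins, and this formulation is not claimed to be the paper's):

```python
# Toy NUMA locality score: fraction of a task's memory accesses served
# by its local node. Counter sources are hypothetical stand-ins.
def numa_locality(local_accesses, remote_accesses):
    total = local_accesses + remote_accesses
    return local_accesses / total if total else 1.0

# Tasks whose score swings across placements are where the paper's
# observed 10-20% performance impact shows up.
print(numa_locality(8_000_000, 2_000_000))   # 0.8
```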
Hailong Yang; Alex Breslow; Jason Mars; Lingjia Tang
Bubble-Flux: Precise online QoS management for increased utilization in warehouse scale computers Journal Article
In: ACM SIGARCH Computer Architecture News, vol. 41, no. 3, pp. 607–618, 2013.
@article{yang2013bubble,
title = {Bubble-Flux: Precise online QoS management for increased utilization in warehouse scale computers},
author = {Hailong Yang and Alex Breslow and Jason Mars and Lingjia Tang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/05/2508148.2485974.pdf},
year = {2013},
date = {2013-01-01},
journal = {ACM SIGARCH Computer Architecture News},
volume = {41},
number = {3},
pages = {607--618},
publisher = {ACM New York, NY, USA},
abstract = {Ensuring the quality of service (QoS) for latency-sensitive applications while allowing co-locations of multiple applications on servers is critical for improving server utilization and reducing cost in modern warehouse-scale computers (WSCs). Recent work relies on static profiling to precisely predict the QoS degradation that results from performance interference among co-running applications to increase the number of "safe" co-locations. However, these static profiling techniques have several critical limitations: 1) a priori knowledge of all workloads is required for profiling, 2) it is difficult for the prediction to capture or adapt to phase or load changes of applications, and 3) the prediction technique is limited to only two co-running applications.
To address all of these limitations, we present Bubble-Flux, an integrated dynamic interference measurement and online QoS management mechanism to provide accurate QoS control and maximize server utilization. Bubble-Flux uses a Dynamic Bubble to probe servers in real time to measure the instantaneous pressure on the shared hardware resources and precisely predict how the QoS of a latency-sensitive job will be affected by potential co-runners. Once "safe" batch jobs are selected and mapped to a server, Bubble-Flux uses an Online Flux Engine to continuously monitor the QoS of the latency-sensitive application and control the execution of batch jobs to adapt to dynamic input, phase, and load changes to deliver satisfactory QoS. Batch applications remain in a state of flux throughout execution. Our results show that the utilization improvement achieved by Bubble-Flux is up to 2.2x better than the prior static approach.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
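The Online Flux Engine's control action can be sketched as a duty cycle on batch jobs. This is one plausible reading of the abstract with invented step sizes; the Dynamic Bubble (not shown) supplies the initial "safe" co-runner selection.

```python
# Sketch of a Flux-Engine-style duty-cycle controller for batch jobs.
def flux_step(measured_qos, target_qos, duty, step=0.05):
    """Return the new fraction of time batch jobs are allowed to run."""
    if measured_qos < target_qos:
        return max(0.0, duty - step)   # phase batch jobs out: protect QoS
    return min(1.0, duty + step)       # phase them back in: reclaim utilization

duty = 1.0
for qos in [0.97, 0.93, 0.88, 0.91, 0.96]:   # simulated QoS samples
    duty = flux_step(qos, target_qos=0.95, duty=duty)
    print(f"qos={qos:.2f} -> batch duty cycle {duty:.2f}")
```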
Jason Mars; Lingjia Tang
Whare-Map: heterogeneity in "homogeneous" warehouse-scale computers Proceedings Article
In: Proceedings of the 40th Annual International Symposium on Computer Architecture, pp. 619–630, 2013.
@inproceedings{mars2013whare,
title = {Whare-Map: heterogeneity in "homogeneous" warehouse-scale computers},
author = {Jason Mars and Lingjia Tang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/05/2508148.2485975.pdf},
year = {2013},
date = {2013-01-01},
booktitle = {Proceedings of the 40th Annual International Symposium on Computer Architecture},
pages = {619--630},
abstract = {Modern "warehouse scale computers" (WSCs) continue to be embraced as homogeneous computing platforms. However, due to frequent machine replacements and upgrades, modern WSCs are in fact composed of diverse commodity microarchitectures and machine configurations. Yet, current WSCs are architected with the assumption of homogeneity, leaving a potentially significant performance opportunity unexplored.
In this paper, we expose and quantify the performance impact of the "homogeneity assumption" for modern production WSCs using industry-strength large-scale web-service workloads. In addition, we argue for, and evaluate the benefits of, a heterogeneity-aware WSC using commercial web-service production workloads including Google's web-search. We also identify key factors impacting the available performance opportunity when exploiting heterogeneity and introduce a new metric, opportunity factor, to quantify an application's sensitivity to the heterogeneity in a given WSC. To exploit heterogeneity in "homogeneous" WSCs, we propose "Whare-Map," the WSC Heterogeneity Aware Mapper that leverages already in-place continuous profiling subsystems found in production environments. When employing "Whare-Map", we observe a cluster-wide performance improvement of 15% on average over heterogeneity-oblivious job placement and up to an 80% improvement for web-service applications that are particularly sensitive to heterogeneity.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
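A toy rendering of the idea: scores from continuous profiling feed a placement loop, and the opportunity factor prioritizes the jobs with the most to gain. The spread-based formulation below is one plausible definition, not necessarily the paper's, and the numbers are invented.

```python
# Toy heterogeneity-aware placement in the spirit of Whare-Map.
scores = {  # normalized performance of job j on machine class m (invented)
    "websearch": {"sandybridge": 1.00, "westmere": 0.70},
    "batch":     {"sandybridge": 1.00, "westmere": 0.95},
}

def opportunity_factor(job):
    """One plausible formulation: normalized performance spread across classes."""
    perf = scores[job].values()
    return (max(perf) - min(perf)) / max(perf)

# Place the job with the most to gain on its best machine class first.
for job in sorted(scores, key=opportunity_factor, reverse=True):
    best = max(scores[job], key=scores[job].get)
    print(job, "->", best, f"(opportunity {opportunity_factor(job):.2f})")
```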
Jason Mars; Lingjia Tang
Understanding application contentiousness and sensitivity on modern multicores Book Section
In: Advances in Computers, vol. 91, pp. 59–85, Elsevier, 2013.
@incollection{mars2013understanding,
title = {Understanding application contentiousness and sensitivity on modern multicores},
author = {Jason Mars and Lingjia Tang},
url = {https://www.jasonmars.org/wp-content/uploads/2020/05/1-s2.0-B9780124080898000021-main.pdf},
year = {2013},
date = {2013-01-01},
booktitle = {Advances in Computers},
volume = {91},
pages = {59--85},
publisher = {Elsevier},
abstract = {Runtime systems to mitigate memory resource contention problems on multicore processors have recently attracted much research attention. One critical component of these runtimes is the indicators to rank and classify applications based on their contention characteristics. However, although there has been significant research effort, application contention characteristics remain not well understood and indicators have not been thoroughly evaluated. In this chapter, we performed a thorough study of applications' contention characteristics to develop better indicators to improve contention-aware runtime systems. The contention characteristics are composed of an application's contentiousness, and its sensitivity to contention. We show that contentiousness and sensitivity are not strongly correlated, and contrary to prior wisdom, a single indicator is not adequate to predict both. Also, while prior wisdom has relied on last level cache miss rate as one of the best indicators to predict an application's contention characteristics, we show that depending on the workloads, it can often be misleading. We then present prediction models that consider contention in various memory resources. Our regression analysis establishes an accurate model to predict application contentiousness. The analysis also demonstrates that performance counters alone may not be sufficient to accurately predict application sensitivity to contention. In this chapter, we also present an evaluation using SPEC CPU2006 benchmarks showing that, when predicting an application's contentiousness, the linear correlation coefficient R2 of our predictor and the real measured contentiousness is 0.834, as opposed to 0.224 when using last level cache miss rate.},
keywords = {},
pubstate = {published},
tppubtype = {incollection}
}
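The chapter's regression idea, predicting contentiousness from several memory-subsystem counters rather than LLC miss rate alone, can be sketched with ordinary least squares. The counter set and data below are placeholders, not the chapter's measurements.

```python
# Sketch of a multi-counter contentiousness regression (placeholder data).
import numpy as np

# rows: applications; cols: llc_miss_rate, llc_refs, prefetch_traffic, dram_bw
X = np.array([[0.02, 5.1, 1.2, 3.3],
              [0.10, 9.4, 2.8, 7.9],
              [0.01, 2.0, 0.4, 1.1],
              [0.06, 7.3, 1.9, 5.2],
              [0.04, 6.0, 1.0, 4.4]])
y = np.array([0.12, 0.55, 0.05, 0.31, 0.22])   # measured contentiousness

A = np.c_[X, np.ones(len(X))]                  # add intercept column
coef, *_ = np.linalg.lstsq(A, y, rcond=None)

def predict_contentiousness(counters):
    return float(np.dot(coef[:-1], counters) + coef[-1])

print(predict_contentiousness([0.05, 6.5, 1.5, 4.8]))
```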
Alex D Breslow; Ananta Tiwari; Martin Schulz; Laura Carrington; Lingjia Tang; Jason Mars
Enabling fair pricing on HPC systems with node sharing Proceedings Article
In: Proceedings of the international conference on high performance computing, networking, storage and analysis, pp. 1–12, 2013.
@inproceedings{breslow2013enabling,
title = {Enabling fair pricing on HPC systems with node sharing},
author = {Alex D Breslow and Ananta Tiwari and Martin Schulz and Laura Carrington and Lingjia Tang and Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/05/2503210.2503256.pdf},
year = {2013},
date = {2013-01-01},
booktitle = {Proceedings of the international conference on high performance computing, networking, storage and analysis},
pages = {1--12},
abstract = {Co-location, where multiple jobs share compute nodes in large-scale HPC systems, has been shown to increase aggregate throughput and energy efficiency by 10 to 20%. However, system operators disallow co-location due to fair-pricing concerns, i.e., a pricing mechanism that considers performance interference from co-running jobs. In the current pricing model, application execution time determines the price, which results in unfair prices paid by the minority of users whose jobs suffer from co-location.
This paper presents POPPA, a runtime system that enables fair pricing by delivering precise online interference detection and facilitates the adoption of supercomputers with co-locations. POPPA leverages a novel shutter mechanism -- a cyclic, fine-grained interference sampling mechanism to accurately deduce the interference between co-runners -- to provide unbiased pricing of jobs that share nodes. POPPA is able to quantify inter-application interference within 4% mean absolute error on a variety of co-located benchmark and real scientific workloads.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2012
Lingjia Tang; Jason Mars; Mary Lou Soffa
Compiling for niceness: Mitigating contention for QoS in warehouse scale computers Proceedings Article
In: Proceedings of the Tenth International Symposium on Code Generation and Optimization, pp. 1–12, 2012.
@inproceedings{tang2012compiling,
title = {Compiling for niceness: Mitigating contention for QoS in warehouse scale computers},
author = {Lingjia Tang and Jason Mars and Mary Lou Soffa},
url = {https://www.jasonmars.org/wp-content/uploads/2020/12/Compiling_for_niceness_Mitigating_contention_for_Q.pdf},
year = {2012},
date = {2012-01-01},
booktitle = {Proceedings of the Tenth International Symposium on Code Generation and Optimization},
pages = {1--12},
abstract = {As the class of datacenters recently coined as warehouse scale computers (WSCs) continues to leverage commodity multicore processors with increasing core counts, there is a growing need to consolidate various workloads on these machines to fully utilize their computation power. However, it is well known that when multiple applications are co-located on a multicore machine, contention for shared memory resources can cause severe cross-core performance interference. To ensure that the quality of service (QoS) of user-facing applications does not suffer from performance interference, WSC operators resort to disallowing co-location of latency-sensitive applications with other applications. This policy translates to low machine utilization and millions of dollars wasted in WSCs.
This paper presents QoS-Compile, the first compilation approach that statically manipulates application contentiousness to enable the co-location of applications with varying QoS requirements, and as a result, can greatly improve machine utilization. Our technique first pinpoints an application's code regions that tend to cause contention and performance interference. QoS-Compile then transforms those regions to reduce their contentious nature. In essence, to co-locate applications of different QoS priorities, our compilation technique uses pessimizing transformations to throttle down the memory access rate of the contentious regions in low priority applications to reduce their interference to high priority applications. Our evaluation using synthetic benchmarks, SPEC benchmarks and large-scale Google applications show that QoS-Compile can greatly reduce contention, improve QoS of applications, and improve machine utilization. Our experiments show that our technique improves applications' QoS performance by 21% and machine utilization by 36% on average.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
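In spirit, the pessimizing transformation trades a little low-priority throughput for a lower memory request rate. A toy Python analogue follows; the chunk size and nap length stand in for values the compiler would pick, and the real transformation operates on native code, not Python.

```python
# Toy illustration of a "pessimizing transformation" that throttles a
# contentious region's memory access rate by napping between chunks.
import time

def contentious_sum(buf):                 # original contentious region
    return sum(buf)

def throttled_sum(buf, chunk=4096, nap_s=1e-5):
    total = 0
    for i in range(0, len(buf), chunk):   # transformed: same result,
        total += sum(buf[i:i + chunk])    # lower memory request rate
        time.sleep(nap_s)
    return total

assert contentious_sum(range(10_000)) == throttled_sum(list(range(10_000)))
```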
Jason Mars; Lingjia Tang; Kevin Skadron; Mary Lou Soffa; Robert Hundt
Increasing utilization in modern warehouse-scale computers using bubble-up Journal Article
In: IEEE Micro, vol. 32, no. 3, pp. 88–99, 2012.
@article{mars2012increasing,
title = {Increasing utilization in modern warehouse-scale computers using bubble-up},
author = {Jason Mars and Lingjia Tang and Kevin Skadron and Mary Lou Soffa and Robert Hundt},
url = {https://www.jasonmars.org/wp-content/uploads/2020/12/Increasing_Utilization_in_Modern_Warehouse-Scale_C.pdf},
year = {2012},
date = {2012-01-01},
journal = {IEEE Micro},
volume = {32},
number = {3},
pages = {88--99},
publisher = {IEEE},
abstract = {Precisely predicting performance degradation due to co-locating multiple executing applications on a single machine is critical for improving utilization in modern warehouse-scale computers (WSCs). Bubble-Up is the first mechanism for such precise prediction. As opposed to over-provisioning machines, Bubble-Up enables the safe colocation of multiple workloads on a single machine for Web service applications that have quality of service constraints, thus greatly improving machine utilization in modern WSCs.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
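The Bubble-Up methodology reduces co-location decisions to a curve lookup: a sensitivity curve measured by expanding a memory "bubble" against the latency-sensitive job, and a single bubble score per batch workload. A sketch with invented data points:

```python
# Minimal sketch of Bubble-Up-style prediction as a curve lookup.
import bisect

bubble_mb     = [2,    4,    8,    16,   32]      # bubble sizes probed
qos_at_bubble = [0.99, 0.97, 0.92, 0.85, 0.70]    # measured QoS curve (invented)

def predict_qos(batch_bubble_score_mb):
    """Look up (conservatively) the QoS at the co-runner's bubble score."""
    i = bisect.bisect_left(bubble_mb, batch_bubble_score_mb)
    return qos_at_bubble[min(i, len(qos_at_bubble) - 1)]

# A batch job with bubble score 8 MB predicts 92% QoS: safe at a 90% target.
print(predict_qos(8))
```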
Wei Wang; Tanima Dey; Jason Mars; Lingjia Tang; Jack W Davidson; Mary Lou Soffa
Performance analysis of thread mappings with a holistic view of the hardware resources Proceedings Article
In: 2012 IEEE International Symposium on Performance Analysis of Systems & Software, pp. 156–167, IEEE 2012.
@inproceedings{wang2012performance,
title = {Performance analysis of thread mappings with a holistic view of the hardware resources},
author = {Wei Wang and Tanima Dey and Jason Mars and Lingjia Tang and Jack W Davidson and Mary Lou Soffa},
url = {https://www.jasonmars.org/wp-content/uploads/2020/12/10.1.1.384.6879.pdf},
year = {2012},
date = {2012-01-01},
booktitle = {2012 IEEE International Symposium on Performance Analysis of Systems & Software},
pages = {156--167},
organization = {IEEE},
abstract = {With the shift to chip multiprocessors, managing shared resources has become a critical issue in realizing their full potential. Previous research has shown that thread mapping is a powerful tool for resource management. However, the difficulty of simultaneously managing multiple hardware resources and the varying nature of the workloads have impeded the efficiency of thread mapping algorithms. To overcome the difficulties of simultaneously managing multiple resources with thread mapping, the interaction between various microarchitectural resources and thread characteristics must be well understood. This paper presents an in-depth analysis of PARSEC benchmarks running under different thread mappings to investigate the interaction of various thread mappings with microarchitectural resources including L1 I/D-caches, I/D TLBs, L2 caches, hardware prefetchers, off-chip memory interconnects, branch predictors, memory disambiguation units and the cores. For each resource, the analysis provides guidelines for how to improve its utilization when mapping threads with different characteristics. We also analyze how the relative importance of the resources varies depending on the workloads. Our experiments show that when only memory resources are considered, thread mapping improves an application's performance by as much as 14% over the default Linux scheduler. In contrast, when both memory and processor resources are considered the mapping algorithm achieves performance improvements by as much as 28%. Additionally, we demonstrate that thread mapping should consider L2 caches, prefetchers and off-chip memory interconnects as one resource, and we present a new metric called L2-misses-memory-latency-product (L2MP) for evaluating their aggregated performance impact.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
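The L2MP metric is the product named in its title. A direct transcription, with invented event counts for the example:

```python
# L2-misses-memory-latency-product (L2MP) for ranking thread mappings.
def l2mp(l2_misses, avg_mem_latency_cycles):
    return l2_misses * avg_mem_latency_cycles

# Prefer the thread mapping with the smaller aggregate L2MP:
mappings = {"spread": l2mp(1.2e9, 180), "packed": l2mp(2.0e9, 240)}
print(min(mappings, key=mappings.get))   # -> "spread"
```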
Kristen Walcott-Justice; Jason Mars; Mary Lou Soffa
THeME: a system for testing by hardware monitoring events Proceedings Article
In: Proceedings of the 2012 International Symposium on Software Testing and Analysis, pp. 12–22, 2012.
@inproceedings{walcott2012theme,
title = {THeME: a system for testing by hardware monitoring events},
author = {Kristen Walcott-Justice and Jason Mars and Mary Lou Soffa},
url = {https://www.jasonmars.org/wp-content/uploads/2020/12/THeME_a_system_for_testing_by_hardware_m.pdf},
year = {2012},
date = {2012-01-01},
booktitle = {Proceedings of the 2012 International Symposium on Software Testing and Analysis},
pages = {12--22},
abstract = {The overhead of test coverage analysis is dominated by monitoring the application, which is traditionally performed using instrumentation. However, instrumentation can prohibitively increase the time and especially the memory overhead of an application. As an alternative to instrumentation, we explore how recent hardware advances can be leveraged to improve the overheads of test coverage analysis. These hardware advances include hardware performance monitors and multicore technology.
In this work, we present our system, THeME, a testing framework that replaces instrumentation with hardware monitoring. THeME consists of a runtime system that takes advantage of hardware mechanisms and multiple cores and a static component to further extend the coverage derived from hardware event sampling. The results show that up to 90% of the actual coverage can be determined with less time overhead and negligible code growth compared to instrumentation.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
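A much-simplified sketch of deriving coverage from sampled branch events, with a static inference step that extends coverage to straight-line blocks between observed samples. The block layout and the sampled (source, target) pairs are invented; THeME's actual CFG analysis is considerably richer.

```python
# Simplified sketch of THeME-style coverage from hardware branch sampling.
blocks = ["b0", "b1", "b2", "b3", "b4"]          # basic blocks in layout order
index = {b: i for i, b in enumerate(blocks)}

samples = [("b0", "b2"), ("b3", "b4")]           # sampled branch events (PMU)

covered = set()
for src, dst in samples:
    covered.update((src, dst))                   # endpoints observed directly

# If control entered d1 and next left from s2, and d1..s2 is laid out
# straight-line, every block in between must also have executed.
for (_, d1), (s2, _) in zip(samples, samples[1:]):
    if index[d1] <= index[s2]:
        covered.update(blocks[index[d1]: index[s2] + 1])

print(sorted(covered))                           # ['b0', 'b2', 'b3', 'b4']
```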
Jason Mars; Naveen Kumar
BlockChop: dynamic squash elimination for hybrid processor architecture Proceedings Article
In: 2012 39th Annual International Symposium on Computer Architecture (ISCA), pp. 536–547, IEEE 2012.
@inproceedings{mars2012blockchop,
title = {BlockChop: dynamic squash elimination for hybrid processor architecture},
author = {Jason Mars and Naveen Kumar},
url = {https://www.jasonmars.org/wp-content/uploads/2020/12/mars12isca.pdf},
year = {2012},
date = {2012-01-01},
booktitle = {2012 39th Annual International Symposium on Computer Architecture (ISCA)},
pages = {536--547},
organization = {IEEE},
abstract = {Hybrid processors are HW/SW co-designed processors that leverage blocked-execution, the execution of regions of instructions as atomic blocks, to facilitate aggressive speculative optimization. As we move to a multicore hybrid design, fine grained conflicts for shared data can violate the atomicity requirement of these blocks and lead to expensive squashes and rollbacks. However, as these atomic regions differ from those used in checkpointing and transactional memory systems, the extent of this potentially prohibitive problem remains unclear, and mechanisms to mitigate these squashes dynamically may be critical to enable a highly performant multicore hybrid design. In this work, we investigate how multithreaded applications, both benchmark and commercial workloads, are affected by squashes, and present dynamic mechanisms for mitigating these squashes in hybrid processors. While the current wisdom is that there is not a significant number of squashes for smaller atomic regions, we observe this is not the case for many multithreaded workloads. With region sizes of just 200--500 instructions, we observe a performance degradation ranging from 10% to more than 50% for workloads with a mixture of shared reads and writes. By harnessing the unique flexibility provided by the software subsystem of hybrid processor design, we present BlockChop, a framework for dynamically mitigating squashes on multicore hybrid processors. We present a range of squash handling mechanisms leveraging retrials, interpretation, and retranslation, and find that BlockChop is quite effective. Over the current response to exceptions and squashes in a hybrid design, we are able to improve the performance of benchmark and commercial workloads by 1.4x and 1.2x on average for large and small region sizes respectively.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
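The escalation the abstract describes (retry, then interpret, then retranslate with smaller regions) can be sketched as follows. All execution hooks are placeholders for the hybrid processor's software subsystem, and the retry and size limits are invented.

```python
# Conceptual sketch of BlockChop-style escalating squash responses.
class Squash(Exception):
    """Raised when a conflicting access violates a block's atomicity."""

def execute_block(region):
    """Placeholder: run the translated region atomically (may raise Squash)."""

def interpret(region):
    """Placeholder: execute instruction by instruction; no atomicity needed."""

def retranslate(region, max_size):
    """Placeholder: rebuild the region capped at max_size instructions."""
    return region

def run_region(region, max_retries=2):
    for _ in range(max_retries):
        try:
            return execute_block(region)     # cheap path: just retry the block
        except Squash:
            continue
    interpret(region)                        # guaranteed forward progress
    return retranslate(region, max_size=200) # shrink the conflict window
```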
Jason Mars
Rethinking the architecture of warehouse-scale computers PhD Thesis
2012.
@phdthesis{mars2012rethinking,
title = {Rethinking the architecture of warehouse-scale computers},
author = {Jason Mars},
url = {https://www.jasonmars.org/wp-content/uploads/2020/12/10.1.1.298.5501.pdf},
year = {2012},
date = {2012-01-01},
school = {University of Virginia},
abstract = {As the world’s computation continues to move into the massive datacenter infrastructures recently coined as “warehouse-scale computers” (WSCs), developing highly efficient systems for these computing platforms has become increasingly critical.
The architecture of modern WSCs remains in its relative infancy. In designing modern WSCs, architects start with commodity off-the-shelf components including commodity processors and open source system software components. These components are then stitched together to design a simple and cost effective WSC. While this approach has been effective for producing systems that are functional and can scale the delivery of web-services as demand increases, efficiency has suffered. The commodity components and system software used have not been designed and refined with the unique characteristics of WSCs in mind, and these characteristics may be critical for a highly efficient WSC design. As such, we must rethink the architecture of modern WSCs.
This dissertation argues that one such characteristic has been overlooked: the diversity in execution environments in modern WSCs. We define a given task’s execution environment as the coupling of the machine configuration and the co-running tasks simultaneously executing alongside the given task. At any given time in a WSC, we have a high degree of diversity across these execution environments. This dissertation argues that acknowledging, exploiting, and adapting to the diversity in execution environments are critical for the design of a highly efficient WSC. When ignoring this diversity, three critical design problems arise, including 1) the homogeneous assumption, where all machines and cores in a WSC are assumed to be equal and managed accordingly, 2) the rigidness of applications, where application binaries cannot adapt to changes across and within execution environments, and 3) the oblivion of interference, where interference between tasks within an execution environment cannot be measured or managed.
This dissertation addresses each of these three design problems. First, we address the homogeneous assumption at the cluster level by redesigning the task manager in the WSC to learn which execution environments tasks prefer and map them accordingly. Second, we address the rigidness of applications at the machine level by providing a mechanism to allow applications to adapt to their execution environment, then leverage this mechanism to solve pressing problems in WSCs. Lastly, we address the oblivion of interference at both the cluster and machine levels by providing novel metrics and techniques for measuring and managing interference to improve the utilization of WSCs.
By incorporating an awareness of the diversity in execution environments in these three key design areas, we produce a WSC design that is significantly more efficient in both the performance of the applications that live in this domain and the utilization of compute resources in the WSC. By improving efficiency for these two metrics, we effectively require a smaller WSC for some fixed workload, which has implications on reducing not only the cost of these systems, but also their environmental footprint.},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
2011
Lingjia Tang; Jason Mars; Neil Vachharajani; Robert Hundt; Mary Lou Soffa
The impact of memory subsystem resource sharing on datacenter applications Proceedings Article
In: 2011 38th Annual International Symposium on Computer Architecture (ISCA), pp. 283–294, IEEE 2011.
@inproceedings{tang2011impact,
title = {The impact of memory subsystem resource sharing on datacenter applications},
author = {Lingjia Tang and Jason Mars and Neil Vachharajani and Robert Hundt and Mary Lou Soffa},
year = {2011},
date = {2011-01-01},
booktitle = {2011 38th Annual International Symposium on Computer Architecture (ISCA)},
pages = {283--294},
organization = {IEEE},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jason Mars; Lingjia Tang; Robert Hundt; Kevin Skadron; Mary Lou Soffa
Bubble-up: Increasing utilization in modern warehouse scale computers via sensible co-locations Proceedings Article
In: Proceedings of the 44th annual IEEE/ACM International Symposium on Microarchitecture, pp. 248–259, 2011.
@inproceedings{mars2011bubble,
title = {Bubble-up: Increasing utilization in modern warehouse scale computers via sensible co-locations},
author = {Jason Mars and Lingjia Tang and Robert Hundt and Kevin Skadron and Mary Lou Soffa},
year = {2011},
date = {2011-01-01},
booktitle = {Proceedings of the 44th annual IEEE/ACM International Symposium on Microarchitecture},
pages = {248--259},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jason Mars; Lingjia Tang; Mary Lou Soffa
Directly characterizing cross core interference through contention synthesis Proceedings Article
In: Proceedings of the 6th International Conference on High Performance and Embedded Architectures and Compilers, pp. 167–176, 2011.
@inproceedings{mars2011directly,
title = {Directly characterizing cross core interference through contention synthesis},
author = {Jason Mars and Lingjia Tang and Mary Lou Soffa},
year = {2011},
date = {2011-01-01},
booktitle = {Proceedings of the 6th International Conference on High Performance and Embedded Architectures and Compilers},
pages = {167--176},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}