import * as React from 'react'
  /* @jsx mdx */
import { mdx } from '@mdx-js/react';
/* @jsxRuntime classic */

/* @jsx mdx */

/**
 * Front matter metadata for this post: route path, display date, title,
 * summary, author, primary tag (with its colour) and the list of tag names.
 */
export const _frontmatter = {
  path: "/blog/the-power-of-95-and-p99",
  date: "15th July 2024",
  title: "The Power of P95 and P99",
  summary: "We are proud to unveil our new dashboards that focus on key application performance indicators!",
  author: "Nick Schuch",
  tag: "Development",
  tagColor: "blue",
  // Each tag is stored as an object with a single `name` property.
  tags: ["announcement", "dashboards"].map((name) => ({ name })),
};
// Props spread onto the layout element on every render; currently just the
// post's front matter.
const layoutProps = { _frontmatter: _frontmatter };

// Element type used for the root JSX node of MDXContent.
const MDXLayout = "wrapper";
/**
 * Renders the body of the "The Power of P95 and P99" post as a JSX tree
 * rooted at MDXLayout.
 *
 * The root element receives, in order: `layoutProps`, then every remaining
 * caller prop, then `components` and a fixed `mdxType` of "MDXLayout". Because
 * later JSX attributes win, caller props override `layoutProps` on a key
 * collision, and `components` / `mdxType` cannot be overridden via `props`.
 *
 * @param {object} props - Component props.
 * @param {*} [props.components] - Forwarded unchanged to the root element as
 *   its `components` prop. NOTE(review): presumably a map of MDX component
 *   overrides - confirm against the MDX runtime in use.
 * @returns The JSX element tree for the post content.
 */
export default function MDXContent({
  components,
  ...props
}) {
  return <MDXLayout {...layoutProps} {...props} components={components} mdxType="MDXLayout">
    <p>{`We're proud to unveil our new dashboards, focusing on key application performance indicators!`}</p>
    <p>{`So what led us here? Let's look back at our early assumptions, what we found out along the way, and the changes we've made to address those findings.`}</p>
    <h2>{`Our early assumptions`}</h2>
    <p>{`When we first developed our application dashboards, we strongly focused on cache HIT ratios. The idea was that if we had high cache HIT ratios then the application was scalable and highly performant. `}</p>
    <p>{`We were so invested in cache HIT ratios that we put all the CDN (CloudFront) metrics at the top of the dashboard.`}</p>
    <p><img parentName="p" {...{
        "src": "/images/posts/the-power-of-p95-and-p99/dashboard-by-service.jpg",
        "alt": "Diagram of the original dashboard listing services top to bottom of a request"
      }}></img></p>
    <p>{`High cache HIT ratios are a characteristic of highly performant applications but not the only characteristic.`}</p>
    <h2>{`Our findings`}</h2>
    <p>{`Our dashboards focused so much on caching that a high cache HIT ratio was perceived as the end of the quest towards performance.`}</p>
    <p>{`In reality, our dashboards were hiding crucial indicators that needed to be addressed.`}</p>
    <ul>
      <li parentName="ul">{`Response times`}</li>
      <li parentName="ul">{`HTTP response codes eg. (2xx/4xx/5xx)`}</li>
    </ul>
    <p><img parentName="p" {...{
        "src": "/images/posts/the-power-of-p95-and-p99/hiding-in-plain-sight.jpg",
        "alt": "Diagram showing two key metrics in the middle of the dashboard"
      }}></img></p>
    <p>{`While caching is important, development teams should be focusing on:`}</p>
    <ul>
      <li parentName="ul">{`How slow is my application?`}</li>
      <li parentName="ul">{`Are there any errors?`}</li>
    </ul>
    <p>{`It wasn’t until we started publishing P95 and P99 percentiles to our response times that we discovered the true value of these metrics.`}</p>
    <h2>{`What are P95 and P99 percentiles?`}</h2>
    <p>{`P95 and P99 percentiles are specific points in a dataset that help us understand its distribution:`}</p>
    <ul>
      <li parentName="ul"><strong parentName="li">{`P95`}</strong>{` - This is the value below which 95% of the data points fall. It shows where most values lie, excluding the top 5% of the highest values.`}</li>
      <li parentName="ul"><strong parentName="li">{`P99`}</strong>{` - This is the value below which 99% of the data points fall. It gives an even higher threshold, showing where almost all values lie, except for the top 1% highest values.`}</li>
    </ul>
    <p>{`These percentiles are useful because they indicate how extreme or typical values are in a dataset, helping to understand its overall spread and potential outliers. The key takeaway here is potential outliers.`}</p>
    <p>{`Below is a demonstration of our graphs as we turn on P95 and P99 percentiles.`}</p>
    <p><img parentName="p" {...{
        "src": "/images/posts/the-power-of-p95-and-p99/percentiles-example.jpg",
        "alt": "Diagram demonstrating P95 and P99 percentiles vs average"
      }}></img></p>
    <p>{`We discovered that our average was hiding a lot of outliers. By enabling P95 and P99 percentiles, we saw that the little spike in average was actually smoothed out, when we should have been debugging it.`}</p>
    <p>{`We needed a dashboard refresh.`}</p>
    <h2>{`What changes did we make to our dashboards?`}</h2>
    <p>{`The first update we made was the feng shui of the dashboard. Previously, the dashboard emphasised “the flow of a request”, displaying metrics from the edge at the top and then subsequent metrics as the request passed through the system, e.g. CloudFront to load balancer to application containers.`}</p>
    <p><img parentName="p" {...{
        "src": "/images/posts/the-power-of-p95-and-p99/flow.jpg",
        "alt": "Diagram showing our dashboard layout before and after"
      }}></img></p>
    <p>{`We decided to shuffle these metrics, placing the response times and HTTP response codes at the top—calling them Key Application Performance Indicators. This is us planting our flag and declaring that these are the most important metrics development teams should be reviewing.`}</p>
    <p><img parentName="p" {...{
        "src": "/images/posts/the-power-of-p95-and-p99/placement.jpg",
        "alt": "Diagram showing our dashboard layout before and after"
      }}></img></p>
    <p>{`We also took this opportunity to add slow database queries to our dashboards. These slow queries show developers a list of MySQL queries that took longer than a specified threshold. These query events can then be correlated back to our response times to determine if a slow database call caused a specific spike.`}</p>
    <p>{`Finally, we also split out the cron logs from the application logs. This way, development teams can easily understand if a log is coming from a request or a long-running background task.`}</p>
    <h2>{`Summary`}</h2>
    <p>{`This hierarchy is the new default for our dashboards, which we plan to build upon, exposing further critical metrics.`}</p>
    <p>{`Future improvements will include:`}</p>
    <ul>
      <li parentName="ul">{`Anomaly detection for events that affect key performance indicators`}</li>
      <li parentName="ul">{`OpenTelemetry traces`}</li>
      <li parentName="ul">{`Application Performance Monitoring data`}</li>
    </ul>
    <p>{`We’d love to hear your feedback, including how these new dashboards have helped you improve the performance of your applications!`}</p>

    </MDXLayout>;
}
;
// Static marker set on the component function itself.
// NOTE(review): presumably read by the MDX tooling to recognise compiled MDX
// components - confirm against the MDX version in use.
MDXContent.isMDXComponent = true;
      