Plotting TSA Checkpoint Passenger Numbers in Matplotlib

2022-06-24

At the start of the Covid-19 outbreak, the U.S. Transportation Security Administration began sharing TSA security checkpoint passenger throughput numbers.

Below are several plots created with Matplotlib showing the trend of TSA passengers through the pandemic. The code for these charts is stored on GitHub here.

      
        import pandas as pd
        import matplotlib.pyplot as plt
        from matplotlib import cm
        
        
        def plot_trend(width=10, height=6, show_plot=False):
            '''Plot complete trend of data.'''
        
            df = pd.read_csv('data/tsa.csv')
            df['Date'] = pd.to_datetime(df['Date'])
            latest_date = max(df['Date']).strftime('%Y-%m-%d')
        
            # Plot data
            plt.figure(figsize=(width, height))
            plt.plot(df['Date'], df['Passengers'], color='darkslategray')
            plt.suptitle('U.S. TSA Checkpoint Passengers',
                         x=0.01,
                         y=0.98,
                         fontweight='bold',
                         fontsize='x-large',
                         horizontalalignment='left')
            plt.figtext(0.99, 0.01, 'Data through ' + latest_date,
                        ha = 'right',
                        fontstyle='italic')
            # Convert axis labels to commas and remove scientific notation
            current_values = plt.gca().get_yticks()
            plt.gca().set_yticklabels(['{:,.0f}'.format(x) for x in current_values])
        
            # Highlight latest data point
            latest_number = df[df['Date'] == latest_date]['Passengers']
            latest_number = int(latest_number.iloc[0])
            plt.scatter(pd.to_datetime(latest_date),
                        latest_number,
                        color='goldenrod',
                        alpha=0.7,
                        s=70)
            plt.annotate("{:,}".format(latest_number),
                         xy=(pd.to_datetime(latest_date), latest_number * 1.02),
                         horizontalalignment='center'
                        )
        
            plt.savefig('images/tsa-full-trend.png')
            if show_plot:
                plt.show()
            print('Trend chart created.')
      
    
Trend plot of TSA checkpoint passengers.
      
        def plot_trend_by_year(width=10, height=6, show_plot=False):
          '''Plot trend of data by year.'''
          # Get latest date
          df = pd.read_csv('data/tsa.csv')
          df['Date'] = pd.to_datetime(df['Date'])
          latest_date = max(df['Date']).strftime('%Y-%m-%d')
        
          df = pd.read_csv('data/tsa-by-year.csv')
          df['Date'] = pd.to_datetime(df['Date'])
      
          # Get year labels from columns
          years = df.columns[1:].sort_values()
          num_years = len(years)
      
          # Create colors for each year
          cmap = cm.get_cmap('Blues')
          colors = [0.2, 0.4, 0.6, 0.95]
      
          plt.figure(figsize=(width, height))
          for i in range(num_years):
              plt.plot(df['Date'],
                      df[years[i]],
                      label=years[i],
                      color=cmap(colors[i])
                      )
          plt.legend()
          plt.suptitle('U.S. TSA Checkpoint Passengers by Year',
                      x=0.01,
                      y=0.98,
                      fontweight='bold',
                      fontsize='x-large',
                      horizontalalignment='left')
          plt.figtext(0.99, 0.01, 'Data through ' + latest_date,
                      ha = 'right',
                      fontstyle='italic')
          # Highlight latest data point
          latest_number = df[df['Date'] == latest_date]['2022']
          latest_number = int(latest_number.iloc[0])
          plt.scatter(pd.to_datetime(latest_date),
                      latest_number,
                      color='goldenrod',
                      alpha=0.7,
                      s=70,
                      zorder=2.5)
          plt.annotate("{:,}".format(latest_number),
                      xy=(pd.to_datetime(latest_date), latest_number * 1.02),
                      horizontalalignment='center'
                      )
      
          # Convert axis labels to commas and remove scientific notation
          current_values = plt.gca().get_yticks()
          plt.gca().set_yticklabels(['{:,.0f}'.format(x) for x in current_values])
          
          plt.savefig('images/tsa-by-year.png')
          if show_plot:
              plt.show()
          print('Trend chart by year created.')    
      
    
Trend plot of TSA checkpoint passengers by year.
      
        def plot_percent_change_trend_by_year(width=10, height=6, show_plot=False):
            '''Plot percent change trend by year.'''
            # Get latest date
            df = pd.read_csv('data/tsa.csv')
            df['Date'] = pd.to_datetime(df['Date'])
            latest_date = max(df['Date']).strftime('%Y-%m-%d')
            
            # Read in data by year
            df = pd.read_csv('data/tsa-by-year.csv')
            df['Date'] = pd.to_datetime(df['Date'])
        
            # Calculate year-over-year percent change
            df['2022_Pct_Chg'] = (df['2022'] / df['2021']) - 1
            df['2021_Pct_Chg'] = (df['2021'] / df['2020']) - 1
            df['2020_Pct_Chg'] = (df['2020'] / df['2019']) - 1
        
            df = df[['Date', '2022_Pct_Chg', '2021_Pct_Chg', '2020_Pct_Chg']]
        
            # Get year labels from columns
            years = df.columns[1:].sort_values()
            num_years = len(years)
        
            # Create colors for each year
            cmap = cm.get_cmap('Greens')
            colors = [0.3, 0.5, 0.95]
        
            plt.figure(figsize=(width, height))
            for i in range(num_years):
                plt.plot(df['Date'],
                        df[years[i]],
                        label=years[i],
                        color=cmap(colors[i])
                        )
            plt.legend()
            plt.suptitle('U.S. TSA Checkpoint Passengers Percent Change by Year',
                        x=0.01,
                        y=0.98,
                        fontweight='bold',
                        fontsize='x-large',
                        horizontalalignment='left')
            plt.figtext(0.99, 0.01, 'Data through ' + latest_date,
                        ha = 'right',
                        fontstyle='italic')
                        
            # Add line at zero
            plt.axhline(y=0, color='black', alpha=0.25, linewidth=0.7)
        
            # Highlight latest data point
            latest_pct_chg = df[df['Date'] == latest_date]['2022_Pct_Chg']
            latest_pct_chg = latest_pct_chg.iloc[0]
            plt.scatter(pd.to_datetime(latest_date),
                        latest_pct_chg,
                        color='goldenrod',
                        alpha=0.7,
                        s=70)
            plt.annotate("{:.1%}".format(latest_pct_chg),
                        xy=(pd.to_datetime(latest_date), latest_pct_chg * 1.2)
                        )
        
            # Convert axis labels to commas and remove scientific notation
            current_values = plt.gca().get_yticks()
            plt.gca().set_yticklabels(['{:.0%}'.format(x) for x in current_values])
            
            plt.savefig('images/tsa-pct-chg-by-year.png')
            if show_plot:
                plt.show()
            print('Percent change trend chart by year created.')
      
    
Trend plot of percent change in TSA checkpoint passengers by year.
      
        def plot_percent_of_2019_by_year(width=10, height=6, show_plot=False):
            '''Plot percent of 2019 baseline by year.'''
            # Get latest date
            df = pd.read_csv('data/tsa.csv')
            df['Date'] = pd.to_datetime(df['Date'])
            latest_date = max(df['Date']).strftime('%Y-%m-%d')
            
            # Read in data by year
            df = pd.read_csv('data/tsa-by-year.csv')
            df['Date'] = pd.to_datetime(df['Date'])
        
            # Calculate percent of 2019 for each year
            df['2022_pct_of_2019'] = df['2022'] / df['2019']
            df['2021_pct_of_2019'] = df['2021'] / df['2019']
            df['2020_pct_of_2019'] = df['2020'] / df['2019']
        
            df = df[['Date', '2022_pct_of_2019', '2021_pct_of_2019', '2020_pct_of_2019']]
        
            # Get year labels from columns
            years = df.columns[1:].sort_values()
            num_years = len(years)
        
            # Create colors for each year
            cmap = cm.get_cmap('Oranges')
            colors = [0.3, 0.5, 0.95]
        
            plt.figure(figsize=(width, height))
            for i in range(num_years):
                plt.plot(df['Date'],
                        df[years[i]],
                        label=years[i],
                        color=cmap(colors[i])
                        )
            plt.legend()
            plt.suptitle('U.S. TSA Checkpoint Passengers Percent of 2019 Baseline',
                        x=0.01,
                        y=0.98,
                        fontweight='bold',
                        fontsize='x-large',
                        horizontalalignment='left')
            plt.figtext(0.99, 0.01, 'Data through ' + latest_date,
                        ha = 'right',
                        fontstyle='italic')
        
            # Add line at 100%
            # plt.axhline(y=0, color='black', alpha=0.25, linewidth=0.7)
            plt.axhline(y=1, color='black', alpha=0.25, linewidth=0.7)
        
            # Highlight latest data point
            latest_pct_chg = df[df['Date'] == latest_date]['2022_pct_of_2019']
            latest_pct_chg = latest_pct_chg.iloc[0]
            plt.scatter(pd.to_datetime(latest_date),
                        latest_pct_chg,
                        color='navy',
                        alpha=0.5,
                        s=70,
                        zorder=2.5)
            plt.annotate("{:.1%}".format(latest_pct_chg),
                        xy=(pd.to_datetime(latest_date), latest_pct_chg * 1.02)
                        )
        
            plt.grid(visible=True, axis='y', alpha=0.25)
        
            # Convert axis labels to commas and remove scientific notation
            current_values = plt.gca().get_yticks()
            plt.gca().set_yticklabels(['{:.0%}'.format(x) for x in current_values])
            
            plt.savefig('images/tsa-pct-of-2019.png')
            if show_plot:
                plt.show()
            print('Percent of 2019 trend chart by year created.')
      
    
Trend plot of TSA checkpoint passengers as percent of 2019 by year.
      
        def plot_weekly_midweek_trend_by_year(width=10, height=6, show_plot=False):
            '''Plot weekly Monday - Wednesday trend of TSA checkpoint passnegers by year.
            Monday - Wednesday are peak business travel times, so this gives a rough 
            approximation of business travel trends.'''
        
            # Get latest date
            df = pd.read_csv('data/tsa.csv')
            df['Date'] = pd.to_datetime(df['Date'])
            latest_date = max(df['Date']).strftime('%Y-%m-%d')
            latest_week_number = pd.to_datetime(latest_date).week
            
            # Check if we have a full week of new data, if not then we'll need to delete
            # any recent days until we have a full week of data.
            if pd.to_datetime(latest_date).dayofweek == 6:
                full_week_of_data = True
            else:
                full_week_of_data = False
        
            # Read in data by year, create week number and day-of-week fields
            df = pd.read_csv('data/tsa-by-year.csv')
            df['Date'] = pd.to_datetime(df['Date'])
            df['Week'] = df['Date'].dt.week
            df['DOW'] = df['Date'].dt.dayofweek
            
            # If we don't have a full week of new data, then delete the partial week of 
            # data for the current week.
            if full_week_of_data == False:
                df.loc[df['Week'] == latest_week_number, '2022'] = np.nan
        
            # Adjusted latest_date to the most recent end-of-week date for which we have
            # complete weekly data.
            latest_date = max(df.loc[df['2022'] >= 0, 'Date']).strftime('%Y-%m-%d')
        
            # Filter for only Mon - Wed dates. Monday = 0, Tuesday = 1, etc.
            # https://pandas.pydata.org/docs/reference/api/pandas.DatetimeIndex.dayofweek.html
            df = df[df['DOW'].isin([0, 1, 2])]
            df = df.drop(columns=['DOW', 'Date'])
            df = df.groupby(by='Week').sum()
            df = df.reset_index()
        
            # When doing the groupby and sum, any future 2022 values become 0, so we 
            # need to reset then to NaN.
            df.loc[df['2022'] == 0, '2022'] = np.nan
        
            # Get year labels from columns
            years = df.columns[1:].sort_values()
            num_years = len(years)
        
            # Create colors for each year
            cmap = cm.get_cmap('Purples')
            colors = [0.2, 0.4, 0.6, 0.95]
        
            plt.figure(figsize=(width, height))
            for i in range(num_years):
                plt.plot(df['Week'],
                        df[years[i]],
                        label=years[i],
                        color=cmap(colors[i])
                        )
            plt.legend()
            plt.xlabel('Week Number')
            plt.suptitle('U.S. Weekly Monday - Wednesday TSA Checkpoint Passengers',
                        x=0.01,
                        y=0.98,
                        fontweight='bold',
                        fontsize='x-large',
                        horizontalalignment='left')
            plt.figtext(0.99, 0.01, 'Data through ' + latest_date,
                        ha = 'right',
                        fontstyle='italic')
            plt.figtext(0.01, 0.01, 'Approximating midweek business travel by looking at Mon-Wed.',
                        ha = 'left',
                        fontstyle='italic')
        
            # Highlight latest data point
            if full_week_of_data == False:
                latest_week_number = latest_week_number - 1
        
            latest_number = df[df['Week'] == latest_week_number]['2022']
            latest_number = int(latest_number.iloc[0])
            plt.scatter(latest_week_number,
                        latest_number,
                        color='goldenrod',
                        alpha=0.7,
                        s=70,
                        zorder=2.5)
            plt.annotate("{:,}".format(latest_number),
                        xy=(latest_week_number, latest_number * 1.02),
                        horizontalalignment='center'
                        )
        
            # Convert axis labels to commas and remove scientific notation
            current_values = plt.gca().get_yticks()
            plt.gca().set_yticklabels(['{:,.0f}'.format(x) for x in current_values])
            
            plt.savefig('images/tsa-midweek-by-year.png')
            if show_plot:
                plt.show()
            print('Midweek business travel trend chart by year created.')
      
    
Trend plot of weekly Mon-Wed TSA checkpoint passengers by year.