{"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"}},"nbformat":4,"nbformat_minor":2,"cells":[{"cell_type":"markdown","source":["# apply\n","\n","<div style=\"position: relative; padding-bottom: 62.5%; height: 0;\"><iframe src=\"https://www.loom.com/embed/e414888ad84741da80efa69997a62574\" frameborder=\"0\" webkitallowfullscreen mozallowfullscreen allowfullscreen style=\"position: absolute; top: 0; left: 0; width: 100%; height: 100%;\"></iframe></div>\n","\n","```{jupyter-info}\n","{rel-data-download}`earthquakes.csv`\n","```"],"metadata":{}},{"cell_type":"code","execution_count":1,"source":["import pandas as pd"],"outputs":[],"metadata":{}},{"cell_type":"code","execution_count":5,"source":["df = pd.read_csv('earthquakes.csv')\n","df"],"outputs":[{"output_type":"execute_result","data":{"text/plain":["              id  year  month  day   latitude   longitude        name  \\\n","0     nc72666881  2016      7   27  37.672333 -121.619000  California   \n","1     us20006i0y  2016      7   27  21.514600   94.572100       Burma   \n","2     nc72666891  2016      7   27  37.576500 -118.859167  California   \n","3     nc72666896  2016      7   27  37.595833 -118.994833  California   \n","4     nn00553447  2016      7   27  39.377500 -119.845000      Nevada   \n","...          ...   ...    ...  ...        ...         ...         ...   \n","8389  nc72685246  2016      8   25  36.515499 -121.099831  California   \n","8390  ak13879193  2016      8   25  61.498400 -149.862700      Alaska   \n","8391  nc72685251  2016      8   25  38.805000 -122.821503  California   \n","8392  ci37672328  2016      8   25  34.308000 -118.635333  California   \n","8393  ci37672360  2016      8   25  34.119167 -116.933667  California   \n","\n","      magnitude  \n","0          1.43  \n","1          4.90  \n","2          0.06  \n","3          0.40  \n","4          0.30  \n","...         ...  
\n","8389       2.42  \n","8390       1.40  \n","8391       1.06  \n","8392       1.55  \n","8393       0.89  \n","\n","[8394 rows x 8 columns]"],"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>id</th>\n","      <th>year</th>\n","      <th>month</th>\n","      <th>day</th>\n","      <th>latitude</th>\n","      <th>longitude</th>\n","      <th>name</th>\n","      <th>magnitude</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>nc72666881</td>\n","      <td>2016</td>\n","      <td>7</td>\n","      <td>27</td>\n","      <td>37.672333</td>\n","      <td>-121.619000</td>\n","      <td>California</td>\n","      <td>1.43</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>us20006i0y</td>\n","      <td>2016</td>\n","      <td>7</td>\n","      <td>27</td>\n","      <td>21.514600</td>\n","      <td>94.572100</td>\n","      <td>Burma</td>\n","      <td>4.90</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>nc72666891</td>\n","      <td>2016</td>\n","      <td>7</td>\n","      <td>27</td>\n","      <td>37.576500</td>\n","      <td>-118.859167</td>\n","      <td>California</td>\n","      <td>0.06</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>nc72666896</td>\n","      <td>2016</td>\n","      <td>7</td>\n","      <td>27</td>\n","      <td>37.595833</td>\n","      <td>-118.994833</td>\n","      <td>California</td>\n","      <td>0.40</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>nn00553447</td>\n","      <td>2016</td>\n","      <td>7</td>\n","      <td>27</td>\n","      
<td>39.377500</td>\n","      <td>-119.845000</td>\n","      <td>Nevada</td>\n","      <td>0.30</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>8389</th>\n","      <td>nc72685246</td>\n","      <td>2016</td>\n","      <td>8</td>\n","      <td>25</td>\n","      <td>36.515499</td>\n","      <td>-121.099831</td>\n","      <td>California</td>\n","      <td>2.42</td>\n","    </tr>\n","    <tr>\n","      <th>8390</th>\n","      <td>ak13879193</td>\n","      <td>2016</td>\n","      <td>8</td>\n","      <td>25</td>\n","      <td>61.498400</td>\n","      <td>-149.862700</td>\n","      <td>Alaska</td>\n","      <td>1.40</td>\n","    </tr>\n","    <tr>\n","      <th>8391</th>\n","      <td>nc72685251</td>\n","      <td>2016</td>\n","      <td>8</td>\n","      <td>25</td>\n","      <td>38.805000</td>\n","      <td>-122.821503</td>\n","      <td>California</td>\n","      <td>1.06</td>\n","    </tr>\n","    <tr>\n","      <th>8392</th>\n","      <td>ci37672328</td>\n","      <td>2016</td>\n","      <td>8</td>\n","      <td>25</td>\n","      <td>34.308000</td>\n","      <td>-118.635333</td>\n","      <td>California</td>\n","      <td>1.55</td>\n","    </tr>\n","    <tr>\n","      <th>8393</th>\n","      <td>ci37672360</td>\n","      <td>2016</td>\n","      <td>8</td>\n","      <td>25</td>\n","      <td>34.119167</td>\n","      <td>-116.933667</td>\n","      <td>California</td>\n","      <td>0.89</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>8394 rows × 8 columns</p>\n","</div>"]},"metadata":{},"execution_count":5}],"metadata":{}},{"cell_type":"markdown","source":["Last time, we learned that we can use regular arithmetic operators on `pandas` `DataFrames` or `Series` to transform them. 
For example, if we were working with the earthquakes data, we could multiply each magnitude by 2 using the following syntax to do an element-wise computation."],"metadata":{},"attachments":{}},{"cell_type":"code","execution_count":6,"source":["df['magnitude'] * 2"],"outputs":[{"output_type":"execute_result","data":{"text/plain":["0       2.86\n","1       9.80\n","2       0.12\n","3       0.80\n","4       0.60\n","        ... \n","8389    4.84\n","8390    2.80\n","8391    2.12\n","8392    3.10\n","8393    1.78\n","Name: magnitude, Length: 8394, dtype: float64"]},"metadata":{},"execution_count":6}],"metadata":{}},{"cell_type":"markdown","source":["What if we wanted to find the length of each value in the name column? You might try something like the following and hope it does an element-wise computation as well."],"metadata":{},"attachments":{}},{"cell_type":"code","execution_count":7,"source":["len(df['name'])"],"outputs":[{"output_type":"execute_result","data":{"text/plain":["8394"]},"metadata":{},"execution_count":7}],"metadata":{}},{"cell_type":"markdown","source":["That doesn't look right... Last time we saw you can use the `len` function to find the number of elements in a structure, so this is actually returning the number of elements in the `Series` `df['name']`! 
\n","\n","For the most part, you can only do element-wise operations with:\n","* Arithmetic operators (e.g., `+`, `-`, `*`, etc.)\n","* Comparison operators (e.g., `==`, `<`, etc.)\n","* Logical operators (`&`, `|`, `~`)\n","\n","This means anything else will act on the `Series` itself, just like this `len` function did!\n","\n","## Built-in Functions in `pandas`\n","The syntax looks a bit weird at first, but if you want to call the `len` function on each `str`, you have to use this syntax below."],"metadata":{},"attachments":{}},{"cell_type":"code","execution_count":8,"source":["df['name'].str.len()"],"outputs":[{"output_type":"execute_result","data":{"text/plain":["0       10\n","1        5\n","2       10\n","3       10\n","4        6\n","        ..\n","8389    10\n","8390     6\n","8391    10\n","8392    10\n","8393    10\n","Name: name, Length: 8394, dtype: int64"]},"metadata":{},"execution_count":8}],"metadata":{}},{"cell_type":"markdown","source":["This reads \"Take the name column, and apply the `len` function defined for `str`s to each element in the `Series`\". It looks really odd at first, but it's actually a nice syntax because it lets you be explicit about what type you want to treat the data as and which function to call on it! We won't look at other types now, but there is a similar syntax for those as well.\n","\n","You aren't limited to just calling `len` here; you can call pretty much any `str` function using this syntax. For example, the following cell shows how to convert each name to its upper-case version."],"metadata":{},"attachments":{}},{"cell_type":"code","execution_count":10,"source":["df['name'].str.upper()"],"outputs":[{"output_type":"execute_result","data":{"text/plain":["0       CALIFORNIA\n","1            BURMA\n","2       CALIFORNIA\n","3       CALIFORNIA\n","4           NEVADA\n","           ...    
\n","8389    CALIFORNIA\n","8390        ALASKA\n","8391    CALIFORNIA\n","8392    CALIFORNIA\n","8393    CALIFORNIA\n","Name: name, Length: 8394, dtype: object"]},"metadata":{},"execution_count":10}],"metadata":{}},{"cell_type":"markdown","source":["Do note that this does not modify the original name column, but rather returns a new `Series` with all the names upper-cased.\n","\n","## Apply\n","What if you wanted to write your own function to transform a value and apply it to each element in a `Series`? For example, what if you wanted to grab the first two characters from each name? \n","\n","This is where we will need the more general `apply` function defined for `pandas` objects. `apply` is more general than using the specific `str` functions we saw above since it will let you use almost any function for your data transformation. \n","\n","Before we show how to do the specific example of grabbing the first two characters from the names, let's use this new approach to find the `len` of each name. We first show how to do this, and then explain what is happening."],"metadata":{},"attachments":{}},{"cell_type":"code","execution_count":11,"source":["df['name'].apply(len)"],"outputs":[{"output_type":"execute_result","data":{"text/plain":["0       10\n","1        5\n","2       10\n","3       10\n","4        6\n","        ..\n","8389    10\n","8390     6\n","8391    10\n","8392    10\n","8393    10\n","Name: name, Length: 8394, dtype: int64"]},"metadata":{},"execution_count":11}],"metadata":{}},{"cell_type":"markdown","source":["The first part, `df['name'].apply(`, should probably make some sense to you. We are calling some function named `apply` on the `Series` `df['name']`. What's very strange about this is that it seems to be passing `len` as a parameter to this `apply` function!!! \n","\n","While this does look very strange, this is totally allowed in Python. A function is, in some sense, just like any other value in Python. 
In fact, the name of a function is treated the same as any variable name! \n","\n","So the authors of `pandas` who wrote the `apply` function wrote it to take a parameter that is ANOTHER function. They then call that function on each element in the `Series`. \n","\n","The cell below implements something similar to this behavior, but using `list`s instead. "],"metadata":{},"attachments":{}},{"cell_type":"code","execution_count":12,"source":["def list_apply(values, fun):\n","    \"\"\"\n","    Takes a list of values and a function, and applies that function\n","    to each value in values. The given function must take one parameter\n","    as input and the returned list will be the result of calling that\n","    function once for each value in the list.\n","    \"\"\"\n","    # It's not necessary to use a list comprehension here, \n","    # but it's the easiest way to write this method!\n","    return [fun(v) for v in values]\n","\n","list_apply(['I', 'love', 'dogs'], len)"],"outputs":[{"output_type":"execute_result","data":{"text/plain":["[1, 4, 4]"]},"metadata":{},"execution_count":12}],"metadata":{}},{"cell_type":"markdown","source":["There is no restriction to only passing in the `len` function as a parameter here. You can pass any function that takes a single argument. 
\n","\n","In the cell below, we will define a new function `first_two` that takes a `str` and returns the first two characters and then pass that to `apply`."],"metadata":{},"attachments":{}},{"cell_type":"code","execution_count":14,"source":["def first_two(s):\n","    \"\"\"\n","    Returns the first two characters of the given str as a str.\n","    \n","    Assumes there are at least two characters in s.\n","    \"\"\"\n","    return s[:2]\n","\n","df['name'].apply(first_two)"],"outputs":[{"output_type":"execute_result","data":{"text/plain":["0       Ca\n","1       Bu\n","2       Ca\n","3       Ca\n","4       Ne\n","        ..\n","8389    Ca\n","8390    Al\n","8391    Ca\n","8392    Ca\n","8393    Ca\n","Name: name, Length: 8394, dtype: object"]},"metadata":{},"execution_count":14}],"metadata":{}},{"cell_type":"markdown","source":["## Saving Results\n","Remember this `apply` function doesn't modify any data in the `DataFrame` or `Series`, but rather returns a new one. It's common that you want to save the result of an `apply` to your dataset to use those values later. Just like how you can use the `[]` syntax to select columns from a `DataFrame`, you can use it to set columns in a `DataFrame`. \n","\n","Below, we create a new column in the dataset by assigning to the new column name. 
Notice that `df` now has this extra column."],"metadata":{},"attachments":{}},{"cell_type":"code","execution_count":15,"source":["df['first_two_letters'] = df['name'].apply(first_two)\n","df"],"outputs":[{"output_type":"execute_result","data":{"text/plain":["              id  year  month  day   latitude   longitude        name  \\\n","0     nc72666881  2016      7   27  37.672333 -121.619000  California   \n","1     us20006i0y  2016      7   27  21.514600   94.572100       Burma   \n","2     nc72666891  2016      7   27  37.576500 -118.859167  California   \n","3     nc72666896  2016      7   27  37.595833 -118.994833  California   \n","4     nn00553447  2016      7   27  39.377500 -119.845000      Nevada   \n","...          ...   ...    ...  ...        ...         ...         ...   \n","8389  nc72685246  2016      8   25  36.515499 -121.099831  California   \n","8390  ak13879193  2016      8   25  61.498400 -149.862700      Alaska   \n","8391  nc72685251  2016      8   25  38.805000 -122.821503  California   \n","8392  ci37672328  2016      8   25  34.308000 -118.635333  California   \n","8393  ci37672360  2016      8   25  34.119167 -116.933667  California   \n","\n","      magnitude first_two_letters  \n","0          1.43                Ca  \n","1          4.90                Bu  \n","2          0.06                Ca  \n","3          0.40                Ca  \n","4          0.30                Ne  \n","...         ...               ...  
\n","8389       2.42                Ca  \n","8390       1.40                Al  \n","8391       1.06                Ca  \n","8392       1.55                Ca  \n","8393       0.89                Ca  \n","\n","[8394 rows x 9 columns]"],"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>id</th>\n","      <th>year</th>\n","      <th>month</th>\n","      <th>day</th>\n","      <th>latitude</th>\n","      <th>longitude</th>\n","      <th>name</th>\n","      <th>magnitude</th>\n","      <th>first_two_letters</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>nc72666881</td>\n","      <td>2016</td>\n","      <td>7</td>\n","      <td>27</td>\n","      <td>37.672333</td>\n","      <td>-121.619000</td>\n","      <td>California</td>\n","      <td>1.43</td>\n","      <td>Ca</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>us20006i0y</td>\n","      <td>2016</td>\n","      <td>7</td>\n","      <td>27</td>\n","      <td>21.514600</td>\n","      <td>94.572100</td>\n","      <td>Burma</td>\n","      <td>4.90</td>\n","      <td>Bu</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>nc72666891</td>\n","      <td>2016</td>\n","      <td>7</td>\n","      <td>27</td>\n","      <td>37.576500</td>\n","      <td>-118.859167</td>\n","      <td>California</td>\n","      <td>0.06</td>\n","      <td>Ca</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>nc72666896</td>\n","      <td>2016</td>\n","      <td>7</td>\n","      <td>27</td>\n","      <td>37.595833</td>\n","      <td>-118.994833</td>\n","      
<td>California</td>\n","      <td>0.40</td>\n","      <td>Ca</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>nn00553447</td>\n","      <td>2016</td>\n","      <td>7</td>\n","      <td>27</td>\n","      <td>39.377500</td>\n","      <td>-119.845000</td>\n","      <td>Nevada</td>\n","      <td>0.30</td>\n","      <td>Ne</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>8389</th>\n","      <td>nc72685246</td>\n","      <td>2016</td>\n","      <td>8</td>\n","      <td>25</td>\n","      <td>36.515499</td>\n","      <td>-121.099831</td>\n","      <td>California</td>\n","      <td>2.42</td>\n","      <td>Ca</td>\n","    </tr>\n","    <tr>\n","      <th>8390</th>\n","      <td>ak13879193</td>\n","      <td>2016</td>\n","      <td>8</td>\n","      <td>25</td>\n","      <td>61.498400</td>\n","      <td>-149.862700</td>\n","      <td>Alaska</td>\n","      <td>1.40</td>\n","      <td>Al</td>\n","    </tr>\n","    <tr>\n","      <th>8391</th>\n","      <td>nc72685251</td>\n","      <td>2016</td>\n","      <td>8</td>\n","      <td>25</td>\n","      <td>38.805000</td>\n","      <td>-122.821503</td>\n","      <td>California</td>\n","      <td>1.06</td>\n","      <td>Ca</td>\n","    </tr>\n","    <tr>\n","      <th>8392</th>\n","      <td>ci37672328</td>\n","      <td>2016</td>\n","      <td>8</td>\n","      <td>25</td>\n","      <td>34.308000</td>\n","      <td>-118.635333</td>\n","      <td>California</td>\n","      <td>1.55</td>\n","      <td>Ca</td>\n","    </tr>\n","    <tr>\n","      <th>8393</th>\n","      <td>ci37672360</td>\n","      <td>2016</td>\n","      <td>8</td>\n","      <td>25</td>\n","      <td>34.119167</td>\n","      <td>-116.933667</td>\n","      <td>California</td>\n","      <td>0.89</td>\n","  
    <td>Ca</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>8394 rows × 9 columns</p>\n","</div>"]},"metadata":{},"execution_count":15}],"metadata":{}}]}