DataFrame Methods and Operations
The pandas DataFrame is the core data structure for data analysis in Python. It comes with hundreds of built-in methods that allow you to manipulate, analyze, and transform your data efficiently. This comprehensive guide covers the essential DataFrame methods you'll use in your daily data analysis workflow.
Understanding DataFrame Structure
Before diving into methods, let's understand what we're working with:
import pandas as pd
import numpy as np
# Create a sample DataFrame
# Five employees across three departments; every later example mutates/reads this df.
data = {
'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
'age': [25, 30, 35, 28, 32],
'salary': [50000, 60000, 75000, 55000, 68000],
'department': ['Sales', 'Engineering', 'Sales', 'Marketing', 'Engineering'],
'start_date': ['2020-01-15', '2019-03-22', '2018-07-01', '2021-02-10', '2020-05-18']
}
df = pd.DataFrame(data)
# Parse the ISO-8601 strings into datetime64 so date-based methods work later.
df['start_date'] = pd.to_datetime(df['start_date'])
print("Sample DataFrame:")
print(df)
print(f"\nShape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"Index: {list(df.index)}")
Data Inspection Methods
Basic Information Methods
# Get basic information about the DataFrame
print("=== BASIC INFORMATION ===")
# Shape and size
print(f"Shape (rows, columns): {df.shape}")
print(f"Size (total elements): {df.size}") # size == rows * columns
print(f"Number of dimensions: {df.ndim}")
# Column and index info
print(f"Columns: {list(df.columns)}")
print(f"Index: {list(df.index)}")
print(f"Data types:\n{df.dtypes}")
# Memory usage; deep=True also counts the Python string objects in object columns.
print(f"\nMemory usage:\n{df.memory_usage(deep=True)}")
# Detailed information (info() prints directly; it returns None)
print("\nDetailed info:")
df.info()
Data Preview Methods
# View data samples
print("=== DATA PREVIEW ===")
# First and last rows
print("First 3 rows:")
print(df.head(3))
print("\nLast 2 rows:")
print(df.tail(2))
# Random sample
# NOTE(review): sample() is unseeded, so this output varies between runs.
print("\nRandom 2 rows:")
print(df.sample(2))
# Quick statistical summary (numeric columns only by default)
print("\nStatistical summary:")
print(df.describe())
# Include all columns (including non-numeric)
print("\nComplete description:")
print(df.describe(include='all'))
Missing Data Detection
# Check for missing values
print("=== MISSING DATA ===")
# Count missing values per column
print("Missing values per column:")
print(df.isnull().sum())
# Check if any missing values exist (.any().any(): reduce columns, then rows)
print(f"Any missing values: {df.isnull().any().any()}")
# Get percentage of missing values
missing_percent = (df.isnull().sum() / len(df)) * 100
print("\nMissing data percentage:")
print(missing_percent[missing_percent > 0])
# Show rows with missing values (axis=1 tests each row across its columns)
if df.isnull().any().any():
    print("\nRows with missing values:")
    print(df[df.isnull().any(axis=1)])
Data Selection and Filtering Methods
Column Selection
# Select single column (returns Series)
ages = df['age']
print(f"Ages (Series): {type(ages)}")
print(ages)
# Select multiple columns (returns DataFrame) — note the double brackets
subset = df[['name', 'salary']]
print(f"\nName and salary (DataFrame): {type(subset)}")
print(subset)
# Select columns by data type
numeric_cols = df.select_dtypes(include=['number'])
print("\nNumeric columns:")
print(numeric_cols)
# Select string columns ('object' is how pandas stores plain Python strings)
string_cols = df.select_dtypes(include=['object'])
print("\nString columns:")
print(string_cols)
# Select datetime columns
date_cols = df.select_dtypes(include=['datetime'])
print("\nDateTime columns:")
print(date_cols)
Row Selection and Filtering
# Filter rows based on conditions
print("=== ROW FILTERING ===")
# Single condition
high_earners = df[df['salary'] > 60000]
print("High earners (salary > 60,000):")
print(high_earners)
# Multiple conditions with & (and)
# Parentheses are required: & binds tighter than the comparison operators.
young_engineers = df[(df['age'] < 35) & (df['department'] == 'Engineering')]
print("\nYoung engineers:")
print(young_engineers)
# Multiple conditions with | (or)
sales_or_marketing = df[(df['department'] == 'Sales') | (df['department'] == 'Marketing')]
print("\nSales or Marketing employees:")
print(sales_or_marketing)
# Using isin() for multiple values ('IT' simply matches nothing here)
tech_depts = df[df['department'].isin(['Engineering', 'IT'])]
print("\nTech department employees:")
print(tech_depts)
# String filtering (case-insensitive substring match)
names_with_a = df[df['name'].str.contains('a', case=False)]
print("\nNames containing 'a':")
print(names_with_a)
Advanced Selection with loc and iloc
print("=== ADVANCED SELECTION ===")
# loc: label-based selection
print("Using loc (label-based):")
print(df.loc[0:2, 'name':'salary']) # Rows 0-2, columns name to salary
# Select specific rows and columns
print("\nSpecific rows and columns with loc:")
print(df.loc[[0, 2, 4], ['name', 'department']])
# Conditional selection with loc
print("\nConditional selection with loc:")
print(df.loc[df['age'] > 30, ['name', 'age']])
# iloc: position-based selection
print("\nUsing iloc (position-based):")
print(df.iloc[0:3, 1:4]) # First 3 rows, columns 1-3
# Select specific positions
print("\nSpecific positions with iloc:")
print(df.iloc[[0, 2], [0, 3]]) # Rows 0,2 and columns 0,3
Data Modification Methods
Adding and Modifying Columns
# Create new columns
print("=== ADDING COLUMNS ===")
# Simple calculation
df['salary_k'] = df['salary'] / 1000
print("Added salary in thousands:")
print(df[['name', 'salary', 'salary_k']])
# Conditional column creation (apply calls the lambda once per element)
df['seniority'] = df['age'].apply(lambda x: 'Senior' if x >= 35 else 'Junior')
print("\nAdded seniority column:")
print(df[['name', 'age', 'seniority']])
# Using np.where for conditional assignment (vectorized if/else)
df['high_earner'] = np.where(df['salary'] > 60000, 'Yes', 'No')
print("\nAdded high earner flag:")
print(df[['name', 'salary', 'high_earner']])
# Multiple conditions with np.select (the first matching condition wins)
conditions = [
    df['salary'] < 55000,
    (df['salary'] >= 55000) & (df['salary'] < 65000),
    df['salary'] >= 65000
]
choices = ['Low', 'Medium', 'High']
df['salary_tier'] = np.select(conditions, choices, default='Unknown')
print("\nAdded salary tiers:")
print(df[['name', 'salary', 'salary_tier']])
Modifying Existing Data
print("=== MODIFYING DATA ===")
# Modify single values
df.loc[0, 'salary'] = 52000
print("Modified Alice's salary:")
print(df.loc[0])
# Modify multiple values
df.loc[df['department'] == 'Sales', 'salary'] *= 1.1 # 10% raise for Sales
print("\nSales department after 10% raise:")
print(df[df['department'] == 'Sales'][['name', 'salary']])
# Apply function to column
df['name_upper'] = df['name'].str.upper()
print("\nAdded uppercase names:")
print(df[['name', 'name_upper']])
# Transform multiple columns
df[['salary', 'salary_k']] = df[['salary', 'salary_k']].round(2)
print("\nRounded salary columns:")
print(df[['salary', 'salary_k']])
Dropping Data
print("=== DROPPING DATA ===")
# Drop columns
df_no_upper = df.drop('name_upper', axis=1)
print("Dropped name_upper column:")
print(df_no_upper.columns.tolist())
# Drop multiple columns
df_minimal = df.drop(['salary_k', 'high_earner', 'salary_tier'], axis=1)
print("Dropped multiple columns:")
print(df_minimal.columns.tolist())
# Drop rows by index
df_no_first = df.drop(0, axis=0)
print("Dropped first row:")
print(df_no_first)
# Drop rows by condition
df_no_sales = df[df['department'] != 'Sales']
print("Removed Sales department:")
print(df_no_sales)
# Remove duplicates
df_unique = df.drop_duplicates(subset=['department'])
print("One row per department:")
print(df_unique[['name', 'department']])
Data Sorting Methods
print("=== SORTING DATA ===")
# Sort by single column
df_by_age = df.sort_values('age')
print("Sorted by age (ascending):")
print(df_by_age[['name', 'age']])
# Sort descending
df_by_salary_desc = df.sort_values('salary', ascending=False)
print("\nSorted by salary (descending):")
print(df_by_salary_desc[['name', 'salary']])
# Sort by multiple columns
df_multi_sort = df.sort_values(['department', 'salary'], ascending=[True, False])
print("\nSorted by department (asc) then salary (desc):")
print(df_multi_sort[['name', 'department', 'salary']])
# Sort by index
df_by_index = df.sort_index()
print("\nSorted by index:")
print(df_by_index)
# Get sorted index positions
salary_ranks = df['salary'].argsort()
print(f"\nSalary rank positions: {salary_ranks.tolist()}")
Grouping and Aggregation Methods
print("=== GROUPING AND AGGREGATION ===")
# Group by single column
dept_groups = df.groupby('department')
print("Groups by department:")
for name, group in dept_groups:
print(f"\n{name}:")
print(group[['name', 'salary']])
# Aggregate functions
print("\nDepartment salary statistics:")
dept_stats = df.groupby('department')['salary'].agg(['count', 'mean', 'std', 'min', 'max'])
print(dept_stats)
# Multiple aggregations on different columns
print("\nMultiple aggregations:")
multi_agg = df.groupby('department').agg({
'salary': ['mean', 'std'],
'age': ['mean', 'min', 'max'],
'name': 'count'
})
print(multi_agg)
# Custom aggregation functions
print("\nCustom aggregations:")
custom_agg = df.groupby('department').agg({
'salary': lambda x: x.max() - x.min(), # Salary range
'age': lambda x: f"{x.min()}-{x.max()}" # Age range as string
})
print(custom_agg)
# Transform (returns same shape as original)
df['dept_avg_salary'] = df.groupby('department')['salary'].transform('mean')
print("\nAdded department average salary:")
print(df[['name', 'department', 'salary', 'dept_avg_salary']])
String Methods for Object Columns
print("=== STRING METHODS ===")
# String operations on name column
print("Original names:")
print(df['name'].tolist())
# Case operations
print("\nUppercase names:")
print(df['name'].str.upper().tolist())
print("Lowercase names:")
print(df['name'].str.lower().tolist())
print("Title case names:")
print(df['name'].str.title().tolist())
# String length
df['name_length'] = df['name'].str.len()
print("\nName lengths:")
print(df[['name', 'name_length']])
# String contains
print("\nNames containing 'a':")
contains_a = df[df['name'].str.contains('a', case=False)]
print(contains_a['name'].tolist())
# String replacement
df['name_initials'] = df['name'].str.replace(r'([A-Z])[a-z]+', r'\1.', regex=True)
print("\nName initials:")
print(df[['name', 'name_initials']])
# Split strings
df['first_letter'] = df['name'].str[0] # First character
print("\nFirst letters:")
print(df[['name', 'first_letter']])
Mathematical and Statistical Methods
print("=== MATHEMATICAL OPERATIONS ===")
# Basic statistics
print("Salary statistics:")
print(f"Mean: {df['salary'].mean():.2f}")
print(f"Median: {df['salary'].median():.2f}")
print(f"Standard deviation: {df['salary'].std():.2f}")
print(f"Variance: {df['salary'].var():.2f}")
print(f"Min: {df['salary'].min()}")
print(f"Max: {df['salary'].max()}")
# Quantiles
print("\nSalary quantiles:")
print(df['salary'].quantile([0.25, 0.5, 0.75]))
# Cumulative operations
df['salary_cumsum'] = df['salary'].cumsum()
df['salary_cummax'] = df['salary'].cummax()
print("\nCumulative operations:")
print(df[['name', 'salary', 'salary_cumsum', 'salary_cummax']])
# Correlation
numeric_df = df.select_dtypes(include=['number'])
print("\nCorrelation matrix:")
print(numeric_df.corr())
# Rolling operations (useful for time series)
df_sorted = df.sort_values('start_date')
df_sorted['salary_rolling_mean'] = df_sorted['salary'].rolling(window=2).mean()
print("\n2-period rolling mean of salary:")
print(df_sorted[['name', 'start_date', 'salary', 'salary_rolling_mean']])
Data Type Conversion Methods
print("=== DATA TYPE CONVERSIONS ===")
# Check current data types
print("Current data types:")
print(df.dtypes)
# Convert numeric columns
df['age_float'] = df['age'].astype('float64')
print(f"\nAge as float: {df['age_float'].dtype}")
# Convert to categorical
df['department_cat'] = df['department'].astype('category')
print(f"Department as category: {df['department_cat'].dtype}")
print(f"Categories: {df['department_cat'].cat.categories.tolist()}")
# Convert strings to datetime
if 'start_date' in df.columns and df['start_date'].dtype == 'object':
df['start_date'] = pd.to_datetime(df['start_date'])
print(f"Start date type: {df['start_date'].dtype}")
# Convert boolean-like strings
df['active'] = ['Yes', 'No', 'Yes', 'Yes', 'No']
df['active_bool'] = df['active'].map({'Yes': True, 'No': False})
print(f"\nActive as boolean: {df['active_bool'].dtype}")
# Numeric conversion with error handling
df['salary_str'] = df['salary'].astype(str)
df['salary_numeric'] = pd.to_numeric(df['salary_str'], errors='coerce')
print("Converted string back to numeric")
Handling Missing Data Methods
print("=== MISSING DATA HANDLING ===")
# Create some missing data for demonstration
df_missing = df.copy()
df_missing.loc[1, 'age'] = np.nan
df_missing.loc[3, 'salary'] = np.nan
print("DataFrame with missing values:")
print(df_missing.isnull().sum())
# Detect missing data
print("\nRows with missing data:")
print(df_missing[df_missing.isnull().any(axis=1)])
# Fill missing values
# Forward fill
df_ffill = df_missing.fillna(method='ffill')
print("\nForward fill:")
print(df_ffill.loc[[0, 1, 2], ['name', 'age']])
# Backward fill
df_bfill = df_missing.fillna(method='bfill')
print("\nBackward fill:")
print(df_bfill.loc[[0, 1, 2], ['name', 'age']])
# Fill with specific values
df_filled = df_missing.fillna({'age': df_missing['age'].mean(), 'salary': 0})
print("\nFilled with mean age and 0 salary:")
print(df_filled.loc[[1, 3], ['name', 'age', 'salary']])
# Interpolate missing values
df_missing_numeric = df_missing.select_dtypes(include=['number'])
df_interpolated = df_missing_numeric.interpolate()
print("\nInterpolated values:")
print(df_interpolated)
# Drop missing data
df_complete = df_missing.dropna()
print(f"\nOriginal shape: {df_missing.shape}")
print(f"After dropping NA: {df_complete.shape}")
# Drop columns with any missing values
df_no_missing_cols = df_missing.dropna(axis=1)
print(f"After dropping columns with missing values: {df_no_missing_cols.shape}")
DataFrame Combination Methods
print("=== COMBINING DATAFRAMES ===")
# Create additional DataFrames for demonstration
df2 = pd.DataFrame({
'name': ['Frank', 'Grace'],
'age': [29, 26],
'salary': [58000, 62000],
'department': ['HR', 'Finance']
})
df_bonus = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charlie'],
'bonus': [5000, 7000, 8000]
})
# Concatenate (stack DataFrames)
df_combined = pd.concat([df.iloc[:3], df2], ignore_index=True)
print("Concatenated DataFrames:")
print(df_combined)
# Merge (join) DataFrames
df_with_bonus = pd.merge(df, df_bonus, on='name', how='left')
print("\nMerged with bonus data:")
print(df_with_bonus[['name', 'salary', 'bonus']])
# Different join types
inner_join = pd.merge(df, df_bonus, on='name', how='inner')
print(f"\nInner join shape: {inner_join.shape}")
outer_join = pd.merge(df, df_bonus, on='name', how='outer')
print(f"Outer join shape: {outer_join.shape}")
# Append (deprecated but still used)
# df_appended = df.append(df2, ignore_index=True) # Use concat instead
Reshaping Methods
print("=== RESHAPING DATA ===")
# Pivot table
pivot_data = df.groupby(['department', 'seniority']).size().reset_index(name='count')
pivot_table = pivot_data.pivot(index='department', columns='seniority', values='count').fillna(0)
print("Pivot table:")
print(pivot_table)
# Melt (wide to long format)
df_wide = pd.DataFrame({
'name': ['Alice', 'Bob'],
'Q1_sales': [100, 120],
'Q2_sales': [110, 130],
'Q3_sales': [105, 125]
})
df_long = df_wide.melt(id_vars=['name'],
var_name='quarter',
value_name='sales')
print("\nMelted DataFrame (wide to long):")
print(df_long)
# Pivot (long to wide format)
df_wide_again = df_long.pivot(index='name', columns='quarter', values='sales')
print("\nPivoted back to wide format:")
print(df_wide_again)
# Stack and unstack
df_multi_index = df.set_index(['department', 'name'])
stacked = df_multi_index[['age', 'salary']].stack()
print("\nStacked data:")
print(stacked.head())
unstacked = stacked.unstack()
print("\nUnstacked data:")
print(unstacked)
Advanced Methods and Operations
print("=== ADVANCED OPERATIONS ===")
# Apply custom functions
def categorize_salary(salary):
    """Map a salary to a band: 'Entry Level' (< 55k), 'Mid Level' (< 65k),
    or 'Senior Level' (>= 65k)."""
    # Table-driven thresholds; the first bound the salary falls under wins.
    bands = ((55000, 'Entry Level'), (65000, 'Mid Level'))
    for upper_bound, label in bands:
        if salary < upper_bound:
            return label
    return 'Senior Level'
# apply() calls the Python function once per element — fine at this size,
# but prefer vectorized np.select/pd.cut on large frames.
df['salary_category'] = df['salary'].apply(categorize_salary)
print("Applied custom function:")
print(df[['name', 'salary', 'salary_category']])
# Apply function to multiple columns
def full_info(row):
    """Format one employee row as 'Name (age) - Department'.

    `row` must expose 'name', 'age' and 'department' via item access
    (a DataFrame row passed by df.apply(..., axis=1), or any mapping).
    """
    return "{0} ({1}) - {2}".format(row['name'], row['age'], row['department'])
df['full_info'] = df.apply(full_info, axis=1) # axis=1: one call per row
print("\nApplied function to multiple columns:")
print(df['full_info'])
# Map values
# NOTE: values with no mapping entry become NaN (e.g. other departments).
dept_codes = {'Sales': 'SL', 'Engineering': 'EG', 'Marketing': 'MK'}
df['dept_code'] = df['department'].map(dept_codes)
print("\nMapped department codes:")
print(df[['department', 'dept_code']])
# Replace values (unlike map, unmatched values are left untouched)
df_replaced = df.replace({'Sales': 'Sales Department', 'Engineering': 'Tech Department'})
print("\nReplaced values:")
print(df_replaced['department'].unique())
# Query method (alternative to boolean indexing; column names used directly)
high_earners = df.query('salary > 60000 and age < 35')
print("\nQuery results (high earners under 35):")
print(high_earners[['name', 'age', 'salary']])
# Rank values (rank 1.0 = highest salary because ascending=False)
df['salary_rank'] = df['salary'].rank(ascending=False)
print("\nSalary rankings:")
print(df[['name', 'salary', 'salary_rank']].sort_values('salary_rank'))
Performance Optimization Methods
print("=== PERFORMANCE OPTIMIZATION ===")
# Check memory usage
print("Memory usage:")
print(df.memory_usage(deep=True))
# Optimize data types
def optimize_dtypes(df):
    """Return a copy of *df* with numerically equivalent but smaller dtypes.

    - int64 columns are downcast to the narrowest unsigned (if min >= 0) or
      signed integer type whose full range contains the column's values.
    - object columns whose distinct-value ratio is below 0.5 are converted
      to the memory-efficient 'category' dtype.

    The input DataFrame is never modified. Values are preserved exactly;
    only the storage width changes.
    """
    optimized_df = df.copy()
    # Downcast integer columns to the smallest sufficient width.
    # Bounds come from np.iinfo and are inclusive — the original hand-coded
    # limits used strict '<', which wrongly promoted boundary values such
    # as 255 out of uint8 and 127/-128 out of int8.
    for col in optimized_df.select_dtypes(include=['int64']):
        col_min = optimized_df[col].min()
        col_max = optimized_df[col].max()
        if col_min >= 0:  # Unsigned integers
            candidates = ('uint8', 'uint16', 'uint32')
        else:  # Signed integers
            candidates = ('int8', 'int16', 'int32')
        for dtype in candidates:  # narrowest first; stop at the first fit
            info = np.iinfo(dtype)
            if info.min <= col_min and col_max <= info.max:
                optimized_df[col] = optimized_df[col].astype(dtype)
                break
    # Convert object columns with low cardinality to category.
    # Guard len() to avoid ZeroDivisionError on an empty frame.
    n_rows = len(optimized_df)
    if n_rows:
        for col in optimized_df.select_dtypes(include=['object']):
            if optimized_df[col].nunique() / n_rows < 0.5:
                optimized_df[col] = optimized_df[col].astype('category')
    return optimized_df
# Apply optimization and compare total footprints before/after.
original_memory = df.memory_usage(deep=True).sum()
df_optimized = optimize_dtypes(df)
optimized_memory = df_optimized.memory_usage(deep=True).sum()
print(f"Original memory: {original_memory} bytes")
print(f"Optimized memory: {optimized_memory} bytes")
print(f"Reduction: {(original_memory - optimized_memory) / original_memory * 100:.1f}%")
# Use vectorized operations instead of loops
# Bad: using a loop
# result = []
# for index, row in df.iterrows():
# result.append(row['salary'] * 1.1)
# Good: vectorized operation (runs in C, one expression per column)
df['salary_with_raise'] = df['salary'] * 1.1
print("\nUsed vectorized operation for salary calculation")
Method Chaining
print("=== METHOD CHAINING ===")
# Chain multiple operations together
result = (df
.query('salary > 50000') # Filter high earners
.groupby('department')['salary'] # Group by department
.mean() # Calculate mean
.sort_values(ascending=False) # Sort descending
.round(2) # Round to 2 decimal places
)
print("Chained operations result:")
print(result)
# Complex chaining example
analysis_result = (df
.assign(
salary_normalized=lambda x: (x['salary'] - x['salary'].mean()) / x['salary'].std(),
age_group=lambda x: pd.cut(x['age'], bins=[0, 30, 40, 100], labels=['Young', 'Mid', 'Senior'])
)
.groupby(['department', 'age_group'])
.agg({
'salary': ['mean', 'count'],
'salary_normalized': 'mean'
})
.round(2)
)
print("\nComplex chained analysis:")
print(analysis_result)
Export and Save Methods
print("=== EXPORT METHODS ===")
# Save to different formats
df.to_csv('employees.csv', index=False)
print("Saved to CSV")
df.to_excel('employees.xlsx', index=False, sheet_name='Employees')
print("Saved to Excel")
# Save to JSON
df.to_json('employees.json', orient='records', date_format='iso')
print("Saved to JSON")
# Save to pickle (preserves all pandas data types)
df.to_pickle('employees.pkl')
print("Saved to Pickle")
# Save to Stata format
try:
df.to_stata('employees.dta', write_index=False)
print("Saved to Stata format")
except Exception as e:
print(f"Could not save to Stata: {e}")
# Custom CSV export with specific formatting
df.to_csv('employees_formatted.csv',
index=False,
float_format='%.2f',
date_format='%Y-%m-%d')
print("Saved formatted CSV")
Key Takeaways
Essential DataFrame Methods to Remember:
Data Inspection:
head(), tail(), info(), describe(), shape, dtypes, isnull(), nunique(), value_counts()
Data Selection:
loc[], iloc[], bracket notation [], query(), select_dtypes()
Data Manipulation:
assign(), drop(), rename(), sort_values(), groupby(), apply(), map(), replace()
Data Cleaning:
fillna(), dropna(), duplicated(), drop_duplicates(), astype(), pd.to_numeric(), pd.to_datetime()
Data Combination:
merge(), concat(), join(), pivot(), melt(), pivot_table()
Performance:
- Use vectorized operations over loops
- Chain methods for readable code
- Optimize data types for memory efficiency
What's Next?
Now that you understand DataFrame methods, explore:
- Data Cleaning and Transformation
- Grouping and Aggregation
- Merging and Joining DataFrames
- Statistical Operations
Mastering DataFrame methods is crucial for efficient data analysis. Practice these methods with your own datasets to build muscle memory and discover new ways to manipulate your data.
