Dev In The Mountain Header
A Developer In The Mountains having fun

DataFrame Quick Reference for Stata Users

This is a comprehensive quick reference guide for pandas DataFrame operations, with special focus on functions commonly used when working with Stata-format data. Perfect for economists, social scientists, and researchers transitioning from Stata to Python.

File I/O Operations

Reading Data

# Basic Stata file reading
df = pd.read_stata('data.dta')

# With options (recommended for Stata files)
df = pd.read_stata('data.dta',
                   convert_dates=True,           # Convert Stata dates
                   convert_categoricals=True,    # Convert labeled variables
                   preserve_dtypes=True,         # Keep original data types
                   columns=['var1', 'var2'])     # Read specific variables

# Handle large files
df = pd.read_stata('large_file.dta', chunksize=10000)  # Returns iterator

Writing Data

# Save as Stata file
df.to_stata('output.dta')

# With options
df.to_stata('output.dta',
            write_index=False,         # Don't write index as variable
            version=117,               # Stata version compatibility
            data_label='My Dataset')   # Dataset label

Data Inspection (Stata: describe, summarize, list)

Basic Information

df.info()                    # Stata: describe
df.shape                     # (rows, columns) - Stata: describe, brief
df.dtypes                    # Variable types
df.memory_usage(deep=True)   # Memory usage
df.columns.tolist()          # Variable names
df.index                     # Observation index

Statistical Summary

df.describe()                           # Stata: summarize
df.describe(include='all')              # Include categorical variables
df.describe(percentiles=[.1, .5, .9])   # Custom percentiles
df[['var1', 'var2']].describe()         # Specific variables

# Individual statistics
df['variable'].mean()        # Mean
df['variable'].median()      # Median
df['variable'].std()         # Standard deviation
df['variable'].min()         # Minimum
df['variable'].max()         # Maximum
df['variable'].quantile(0.75) # 75th percentile

Data Preview

df.head(10)                  # Stata: list in 1/10
df.tail(5)                   # Last 5 observations
df.sample(10)                # Random 10 observations
df.iloc[100:110]             # Observations 100-109

Data Selection and Filtering

Variable Selection

df['varname']                     # Single variable (Series)
df[['var1', 'var2', 'var3']]     # Multiple variables (DataFrame)
df.loc[:, 'var1':'var5']         # Range of variables
df.filter(regex='^income')       # Variables starting with 'income'
df.select_dtypes(include=[np.number])  # Numeric variables only

Observation Selection

# By position (Stata: in 1/100)
df.iloc[0:100]               # First 100 observations
df.iloc[-10:]                # Last 10 observations

# By condition (Stata: if condition)
df[df['age'] > 25]                           # Age greater than 25
df[df['income'].between(30000, 80000)]       # Income between values
df[df['status'].isin(['employed', 'student'])] # Multiple category values
df[(df['age'] > 18) & (df['income'] < 50000)] # Multiple conditions
df[df['name'].str.contains('John')]          # String contains
df[df['variable'].notna()]                   # Non-missing values only

Advanced Selection

# Stata: keep/drop
df_subset = df[['var1', 'var2', 'var3']]     # keep var1 var2 var3
df_dropped = df.drop(['var1', 'var2'], axis=1) # drop var1 var2

# Sample selection
df.sample(n=1000)            # Random 1000 observations
df.sample(frac=0.1)          # Random 10% of data
df.query('age > 25 and income < 50000')  # SQL-like syntax

Data Manipulation

Creating New Variables (Stata: generate)

# Simple calculations
df['age_squared'] = df['age'] ** 2
df['log_income'] = np.log(df['income'])
df['income_thousands'] = df['income'] / 1000

# Conditional variables (Stata: generate var = ... if ...)
df['adult'] = df['age'] >= 18
df['income_category'] = pd.cut(df['income'], 
                               bins=[0, 30000, 60000, 100000, np.inf],
                               labels=['Low', 'Medium', 'High', 'Very High'])

# Using numpy.where (Stata: generate var = cond(condition, value1, value2))
df['employment_status'] = np.where(df['employed'] == 1, 'Employed', 'Unemployed')

# Complex conditions
def categorize_age(age):
    """Map a numeric age to a coarse life-stage label.

    Returns 'Minor' (< 18), 'Adult' (18-64), or 'Senior' (>= 65).
    Missing ages (NaN/None) are kept missing: without the guard below,
    a NaN age would fall through to 'Senior', because every comparison
    against NaN evaluates to False.
    """
    if pd.isna(age):
        return np.nan
    if age < 18:
        return 'Minor'
    elif age < 65:
        return 'Adult'
    else:
        return 'Senior'

df['age_group'] = df['age'].apply(categorize_age)

Modifying Existing Variables (Stata: replace)

# Replace values
df.loc[df['age'] < 0, 'age'] = np.nan  # Set negative ages to missing
df['income'] = df['income'].clip(lower=0)  # Cap at minimum value
df['name'] = df['name'].str.title()    # Title case for names

# Recoding variables
df['education'] = df['education'].replace({
    1: 'Elementary', 2: 'High School', 3: 'College', 4: 'Graduate'
})

Missing Data Handling

Identifying Missing Data

df.isnull().sum()            # Count missing by variable
df.isnull().sum() / len(df) * 100  # Percentage missing
df[df.isnull().any(axis=1)]  # Rows with any missing values
df.dropna(how='all')         # Drop completely empty rows

Handling Missing Data

# Drop missing (Stata: drop if missing(...))
df.dropna()                  # Drop rows with any missing
df.dropna(subset=['income']) # Drop if income is missing
df.dropna(thresh=5)          # Keep rows with at least 5 non-missing values

# Fill missing values
df.fillna(0)                 # Fill all missing with 0
df['income'].fillna(df['income'].mean())  # Fill with mean
df.ffill()                   # Forward fill (fillna(method='ffill') is deprecated)
df.bfill()                   # Backward fill (fillna(method='bfill') is deprecated)

Grouping and Aggregation (Stata: by, collapse)

Basic Grouping

# Group statistics (Stata: by var: summarize)
df.groupby('category')['income'].mean()
df.groupby('region').agg({'income': ['mean', 'median', 'std']})

# Multiple grouping variables
df.groupby(['region', 'education'])['income'].mean()

# Custom aggregations
df.groupby('category').agg({
    'income': ['mean', 'std', 'count'],
    'age': ['min', 'max'],
    'employed': 'sum'
})

Advanced Grouping

# Transform (create group statistics as new variables)
df['income_group_mean'] = df.groupby('category')['income'].transform('mean')
df['income_deviation'] = df['income'] - df['income_group_mean']

# Filter groups
df.groupby('category').filter(lambda x: len(x) > 100)  # Groups with >100 obs

# Apply custom functions to groups
def group_stats(group):
    """Summarize one group: observation count, mean income, income spread."""
    incomes = group['income']
    summary = {
        'count': len(group),
        'mean_income': incomes.mean(),
        'income_range': incomes.max() - incomes.min(),
    }
    return pd.Series(summary)

df.groupby('category').apply(group_stats)

Merging and Joining (Stata: merge, append)

Merging DataFrames

# Inner join (Stata: merge 1:1 using, keep(match))
merged = pd.merge(df1, df2, on='id', how='inner')

# Left join (Stata: merge 1:1 using, keep(master match))
merged = pd.merge(df1, df2, on='id', how='left')

# Merge on multiple keys
merged = pd.merge(df1, df2, on=['id', 'year'], how='outer')

# Merge with different column names
merged = pd.merge(df1, df2, left_on='id', right_on='person_id')

Concatenating (Stata: append)

# Append rows (Stata: append using)
combined = pd.concat([df1, df2], ignore_index=True)

# Append with keys to identify source
combined = pd.concat([df1, df2], keys=['Dataset1', 'Dataset2'])

Reshaping Data (Stata: reshape)

Wide to Long (Stata: reshape long)

# Melt multiple columns
df_long = pd.melt(df, 
                  id_vars=['id', 'name'],
                  value_vars=['income_2020', 'income_2021', 'income_2022'],
                  var_name='year', 
                  value_name='income')

Long to Wide (Stata: reshape wide)

# Pivot table
df_wide = df_long.pivot(index='id', columns='year', values='income')

# Pivot with aggregation
df_wide = df_long.pivot_table(index='id', 
                              columns='year', 
                              values='income', 
                              aggfunc='mean')

Frequency Tables and Cross-tabulation (Stata: tabulate)

One-way Tables

df['category'].value_counts()           # Stata: tab category
df['category'].value_counts(normalize=True) # Proportions
df['category'].value_counts().sort_index()  # Sorted by category

Two-way Tables

# Cross-tabulation (Stata: tab var1 var2)
pd.crosstab(df['education'], df['employed'])

# With proportions
pd.crosstab(df['education'], df['employed'], normalize='index')  # Row percentages
pd.crosstab(df['education'], df['employed'], normalize='columns') # Column percentages
pd.crosstab(df['education'], df['employed'], normalize=True)     # Cell percentages

# With margins (totals)
pd.crosstab(df['education'], df['employed'], margins=True)

Sorting and Ordering (Stata: sort, gsort)

Basic Sorting

df.sort_values('income')                    # Stata: sort income
df.sort_values('income', ascending=False)   # Stata: gsort -income
df.sort_values(['region', 'income'])        # Multiple variables
df.sort_values(['region', 'income'], 
               ascending=[True, False])     # Mixed sort order
df.sort_index()                             # Sort by index

Data Types and Conversion

Checking and Converting Types

df.dtypes                           # Check current types
df['variable'].dtype                # Single variable type

# Convert types
df['id'] = df['id'].astype('int')
df['category'] = df['category'].astype('category')  # Saves memory
df['date'] = pd.to_datetime(df['date'])
df['text'] = df['text'].astype('string')

# Convert multiple columns
df = df.astype({'id': 'int', 'score': 'float', 'category': 'category'})

String Operations (Stata: string functions)

Basic String Operations

df['text'].str.upper()              # Stata: upper(text)
df['text'].str.lower()              # Stata: lower(text)
df['text'].str.title()              # Title case
df['text'].str.len()                # Stata: length(text)
df['text'].str.contains('pattern')   # String contains pattern
df['text'].str.startswith('prefix') # Starts with prefix
df['text'].str.replace('old', 'new') # Stata: subinstr(text, "old", "new", .)
df['text'].str.strip()              # Remove leading/trailing spaces
df['text'].str.split('delimiter')   # Split string

Date and Time Operations

Date Handling

# Convert to datetime
df['date'] = pd.to_datetime(df['date'])

# Extract date components
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['dayofweek'] = df['date'].dt.dayofweek

# Date arithmetic
df['days_since'] = (pd.Timestamp.now() - df['date']).dt.days

Performance Tips for Large Stata Files

Memory Optimization

# Read specific columns only
df = pd.read_stata('large_file.dta', columns=['var1', 'var2', 'var3'])

# Use efficient data types
df['id'] = df['id'].astype('int32')        # Instead of int64
df['category'] = df['category'].astype('category')  # For repeated strings

# Read in chunks for very large files
chunk_list = []
for chunk in pd.read_stata('huge_file.dta', chunksize=10000):
    processed_chunk = chunk[chunk['income'] > 0]  # Process each chunk
    chunk_list.append(processed_chunk)
df = pd.concat(chunk_list, ignore_index=True)

Efficient Operations

# Use vectorized operations instead of loops
df['new_var'] = df['var1'] * df['var2']  # Fast
# Instead of: df['new_var'] = df.apply(lambda x: x['var1'] * x['var2'], axis=1)  # Slow

# Use query for complex filtering (can be faster)
df.query('income > 50000 and age < 65')

# Use categorical data for memory savings
df['region'] = df['region'].astype('category')

Common Stata-to-Pandas Equivalents

| Stata Command | Pandas Equivalent |
|---|---|
| describe | df.info(), df.describe() |
| summarize var | df['var'].describe() |
| list in 1/10 | df.head(10) |
| count | len(df) |
| tab var | df['var'].value_counts() |
| tab var1 var2 | pd.crosstab(df['var1'], df['var2']) |
| gen newvar = oldvar * 2 | df['newvar'] = df['oldvar'] * 2 |
| replace var = 0 if var < 0 | df.loc[df['var'] < 0, 'var'] = 0 |
| drop var | df.drop('var', axis=1) |
| keep var1 var2 | df[['var1', 'var2']] |
| sort var | df.sort_values('var') |
| by group: sum var | df.groupby('group')['var'].sum() |
| collapse (mean) var, by(group) | df.groupby('group')['var'].mean() |
| merge 1:1 id using file | pd.merge(df1, df2, on='id') |
| append using file | pd.concat([df1, df2]) |

Quick Troubleshooting

Common Issues with Stata Files

# If dates aren't converting properly
df = pd.read_stata('file.dta', convert_dates=False)
df['date'] = pd.to_datetime(df['date'], format='%d%b%Y')

# If categorical variables show as numbers
df = pd.read_stata('file.dta', convert_categoricals=True)

# If getting encoding errors: modern pandas reads the text encoding from the
# .dta file itself (the old `encoding` argument was removed from read_stata),
# so upgrade pandas, or re-save the file from Stata in a supported version
df = pd.read_stata('file.dta')

# If file is too large
for chunk in pd.read_stata('file.dta', chunksize=5000):
    # Process chunk by chunk
    processed = chunk[chunk['var'] > 0]
    # Save or append to results

This reference guide covers the most commonly used DataFrame operations for working with Stata data in pandas. Bookmark this page for quick access to syntax and examples.

More places to find me
Mental Health
follow me on Mastodon