# ==================== PANDAS CHEAT SHEET ====================
import pandas as pd
# ============ READING/WRITING DATA ============
df = pd.read_csv('file.csv')
df = pd.read_excel('file.xls')
df.to_csv('output.csv', index=False)
df.to_sql('table', conn, index=False)
# ============ BASIC INFO ============
df.head() # first 5 rows
df.tail(3) # last 3 rows
df.shape # (rows, columns) -> (100, 5)
df.columns # column names
df.dtypes # data types
df.info() # summary
df.describe() # statistics
# ============ SELECTING DATA ============
df['col'] # single column (Series)
df[['col1', 'col2']] # multiple columns (DataFrame)
df.loc[0] # row by label/index
df.loc[0:5, 'col'] # rows 0-5, specific column
df.iloc[0:5, 0:2] # by position (first 5 rows, first 2 cols)
# ============ FILTERING ============
df[df['age'] > 30] # where age > 30
df[df['country'] == 'Italy'] # where country is Italy
df[df['country'].isin(['Italy', 'Spain'])] # where country in list
df[(df['age'] > 30) & (df['salary'] > 50000)] # multiple conditions
# ============ UNIQUE VALUES ============
df['country'].unique() # array of unique values -> ['Italy', 'Spain', 'France']
df['country'].nunique() # count unique -> 3
df['country'].value_counts()
# Italy 10
# Spain 8
# France 5
# ============ MISSING DATA ============
df.isna().sum() # count NaN per column
df.dropna() # remove rows with NaN
df.fillna(0) # replace NaN with 0
# ============ GROUPBY ============
df.groupby('country')['salary'].mean()
# country
# France 45000
# Italy 52000
# Spain 48000
df.groupby('country').agg({'salary': 'mean', 'age': 'max'})
# salary age
# France 45000 55
# Italy 52000 60
# ============ SORTING ============
df.sort_values('salary') # ascending
df.sort_values('salary', ascending=False) # descending
df.sort_values(['country', 'salary']) # multiple columns
# ============ ADDING/MODIFYING COLUMNS ============
df['new_col'] = df['salary'] * 2
df['category'] = df['age'].apply(lambda x: 'old' if x > 50 else 'young')
# ============ RENAMING ============
df.rename(columns={'old_name': 'new_name'})
# ============ DROP ============
df.drop(columns=['col1', 'col2'])
df.drop(index=[0, 1, 2])
# ============ MERGE/JOIN ============
pd.merge(df1, df2, on='id') # inner join
pd.merge(df1, df2, on='id', how='left') # left join
# ============ CONCAT ============
pd.concat([df1, df2]) # stack vertically
pd.concat([df1, df2], axis=1) # stack horizontally
# ============ pd.cut() - BINNING ============
ages = pd.Series([15, 25, 35, 45, 55])
pd.cut(ages, bins=3, labels=['young', 'mid', 'old'])
# 0 young
# 1 young
# 2 mid
# 3 mid
# 4 old
# ============ QUICK PLOTTING ============
df['salary'].plot() # line plot
df['salary'].plot(kind='bar') # bar plot
df.plot(x='year', y='salary') # x vs y
df.groupby('country')['salary'].mean().plot(kind='bar')
# ============ COMMON AGGREGATIONS ============
df['col'].sum()
df['col'].mean()
df['col'].min()
df['col'].max()
df['col'].count()
df['col'].std()
# ==================== SQLITE + PANDAS CHEAT SHEET ====================
import sqlite3
import pandas as pd
# ============ CONNECT TO DATABASE ============
conn = sqlite3.connect('database.sqlite') # creates file if doesn't exist
conn.close() # always close when done
# ============ PANDAS TO SQLITE ============
conn = sqlite3.connect('mydb.sqlite')
# Write entire dataframe to SQLite table
df.to_sql('table_name', conn, index=False, if_exists='replace')
# if_exists options:
# 'fail' - error if table exists (default)
# 'replace' - drop table and recreate
# 'append' - add rows to existing table
conn.close()
# ============ SQLITE TO PANDAS ============
conn = sqlite3.connect('mydb.sqlite')
# Read entire table
df = pd.read_sql_query('SELECT * FROM table_name', conn)
# Read with filter
df = pd.read_sql_query('SELECT * FROM happiness WHERE year > 2015', conn)
# Read specific columns
df = pd.read_sql_query('SELECT country, year, salary FROM employees', conn)
# Read with multiple conditions
df = pd.read_sql_query('''
SELECT * FROM happiness
WHERE "Log GDP per capita" > 11.2
AND year >= 2010
''', conn)
conn.close()
# ============ IMPORTANT: COLUMN NAMES WITH SPACES ============
# Use double quotes around column names with spaces
df = pd.read_sql_query('SELECT "Country name", "Life Ladder" FROM happiness', conn)
# ============ COMMON SQL QUERIES ============
# Count rows
pd.read_sql_query('SELECT COUNT(*) FROM table_name', conn)
# Distinct values
pd.read_sql_query('SELECT DISTINCT country FROM happiness', conn)
# Order by
pd.read_sql_query('SELECT * FROM happiness ORDER BY year DESC', conn)
# Group by with aggregation
pd.read_sql_query('''
SELECT country, AVG(salary) as avg_salary
FROM employees
GROUP BY country
''', conn)
# ============ TYPICAL WORKFLOW ============
# 1. Read Excel/CSV
df = pd.read_excel('data.xls')
# 2. Select columns
df_subset = df[['col1', 'col2', 'col3']]
# 3. Save to SQLite
conn = sqlite3.connect('mydb.sqlite')
df_subset.to_sql('mytable', conn, index=False, if_exists='replace')
conn.close()
# 4. Later, read back with filter
conn = sqlite3.connect('mydb.sqlite')
df_filtered = pd.read_sql_query('SELECT * FROM mytable WHERE col1 > 100', conn)
conn.close()
# ============ MODIFY DATA & SAVE TO NEW DB ============
# Read from db1
conn1 = sqlite3.connect('db1.sqlite')
df = pd.read_sql_query('SELECT * FROM table1', conn1)
conn1.close()
# Modify in pandas
df['new_col'] = df['old_col'] * 10
df = df.drop(columns=['old_col'])
df = df.rename(columns={'new_col': 'better_name'})
# Save to db2
conn2 = sqlite3.connect('db2.sqlite')
df.to_sql('table1', conn2, index=False, if_exists='replace')
conn2.close()
# ============ FILE SIZE ============
import os
os.path.getsize('file.sqlite') # size in bytes
# ==================== MATPLOTLIB CHEAT SHEET ====================
import matplotlib.pyplot as plt
# ============ BASIC LINE PLOT ============
plt.plot([1, 2, 3, 4], [10, 20, 25, 30])
plt.show()
# ============ LINE PLOT WITH LABELS ============
plt.plot([2020, 2021, 2022], [100, 150, 130])
plt.xlabel('Year')
plt.ylabel('Sales')
plt.title('Sales Over Time')
plt.show()
# ============ MULTIPLE LINES (SAME PLOT) ============
plt.plot([2020, 2021, 2022], [100, 150, 130], label='Italy')
plt.plot([2020, 2021, 2022], [90, 120, 140], label='Spain')
plt.plot([2020, 2021, 2022], [80, 110, 160], label='France')
plt.legend() # shows the labels
plt.show()
# ============ BAR PLOT ============
plt.bar(['Italy', 'Spain', 'France'], [100, 90, 80])
plt.show()
# ============ BAR PLOT WITH OPTIONS ============
plt.bar(['Italy', 'Spain', 'France'], [100, 90, 80], color='green')
plt.title('GDP by Country')
plt.xticks(rotation=45) # rotate x labels
plt.tight_layout() # prevent labels from cutting off
plt.show()
# ============ HORIZONTAL BAR ============
plt.barh(['Italy', 'Spain', 'France'], [100, 90, 80])
plt.show()
# ============ SCATTER PLOT ============
plt.scatter([1, 2, 3, 4], [10, 20, 15, 30])
plt.show()
# ============ HISTOGRAM ============
data = [1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 5, 5]
plt.hist(data, bins=5)
plt.show()
# ============ PIE CHART ============
plt.pie([30, 40, 30], labels=['A', 'B', 'C'])
plt.show()
# ============ PLOT FROM PANDAS DIRECTLY ============
df['salary'].plot() # line
df['salary'].plot(kind='bar') # bar
df.plot(x='year', y='salary') # x vs y
df.plot(x='year', y='salary', kind='scatter')
# ============ GROUPBY + PLOT ============
df.groupby('country')['salary'].mean().plot(kind='bar')
plt.title('Average Salary by Country')
plt.show()
# ============ MULTIPLE LINES FROM DATAFRAME ============
countries = ['Italy', 'Spain', 'France']
for country in countries:
data = df[df['country'] == country]
plt.plot(data['year'], data['value'], label=country)
plt.legend()
plt.show()
# ============ STYLING OPTIONS ============
plt.plot(x, y, color='red') # color
plt.plot(x, y, linestyle='--') # dashed line
plt.plot(x, y, marker='o') # dots on points
plt.plot(x, y, linewidth=2) # thicker line
# Combined:
plt.plot(x, y, color='blue', linestyle='--', marker='o', linewidth=2, label='Sales')
# ============ FIGURE SIZE ============
plt.figure(figsize=(10, 6)) # width, height in inches
plt.plot(x, y)
plt.show()
# ============ SUBPLOTS (MULTIPLE PLOTS) ============
fig, axes = plt.subplots(1, 2) # 1 row, 2 columns
axes[0].plot(x, y)
axes[0].set_title('Plot 1')
axes[1].bar(['A', 'B'], [10, 20])
axes[1].set_title('Plot 2')
plt.show()
# 2x2 grid
fig, axes = plt.subplots(2, 2)
axes[0, 0].plot(x, y)
axes[0, 1].bar(['A', 'B'], [10, 20])
axes[1, 0].scatter(x, y)
axes[1, 1].hist(data)
plt.tight_layout()
plt.show()
# ============ SAVE FIGURE ============
plt.plot(x, y)
plt.savefig('myplot.png')
plt.savefig('myplot.pdf')
# ============ COMMON FORMATTING ============
plt.xlabel('X Label')
plt.ylabel('Y Label')
plt.title('My Title')
plt.legend() # show legend
plt.xticks(rotation=45) # rotate x labels
plt.tight_layout() # fix layout
plt.grid(True) # add grid
plt.xlim(0, 100) # x axis limits
plt.ylim(0, 50) # y axis limits
# ============================
# PYTHON QUICK CHEAT SHEET
# Requests + GraphQL + SQLite
# ============================
# ---------- SQLite ----------
import sqlite3
# Connect / cursor
conn = sqlite3.connect("mydb.sqlite")
cur = conn.cursor()
# Create table (safe to re-run)
cur.execute("""
CREATE TABLE IF NOT EXISTS table_name (
col1 TEXT,
col2 INTEGER
)
""")
# INSERT (parameterized)
cur.execute(
"INSERT INTO table_name (col1, col2) VALUES (?, ?)",
("value", 10) # tuple matches the ? placeholders
)
conn.commit()
# INSERT many
rows = [("A", 1), ("B", 2)]
cur.executemany(
"INSERT INTO table_name (col1, col2) VALUES (?, ?)",
rows
)
conn.commit()
# SELECT with 1 parameter (NOTE the comma!)
cur.execute("SELECT * FROM table_name WHERE col1 = ?", ("A",))
print(cur.fetchall())
# SELECT with multiple parameters
cur.execute(
"SELECT * FROM table_name WHERE col2 BETWEEN ? AND ?",
(1, 10)
)
print(cur.fetchall())
# OR condition (same value)
q = "A"
cur.execute(
"SELECT * FROM table_name WHERE col1 = ? OR col2 = ?",
(q, q)
)
# IN clause (dynamic list)
ids = [1, 3, 5]
ph = ",".join(["?"] * len(ids)) # "?,?,?"
cur.execute(f"SELECT * FROM table_name WHERE col2 IN ({ph})", ids)
# Fetch methods
cur.fetchone() # one row
cur.fetchmany(5) # up to 5 rows
cur.fetchall() # all rows
# Close DB
cur.close()
conn.close()
# ---------- Requests ----------
import requests
# GET JSON
r = requests.get("https://example.com/data.json", timeout=30)
r.raise_for_status()
data = r.json()
# POST JSON
r = requests.post("https://example.com/api", json={"key": "value"})
r.raise_for_status()
# ---------- GraphQL ----------
url = "https://data.rcsb.org/graphql"
query = """
{
entries(entry_ids: ["1QK1"]) {
rcsb_id
}
}
"""
r = requests.post(url, json={"query": query}, timeout=30)
r.raise_for_status()
j = r.json()
if "errors" in j:
raise RuntimeError(j["errors"])
entries = j["data"]["entries"]