CHEAT SHEETs

# ==================== PANDAS CHEAT SHEET ====================

import pandas as pd

# ============ READING/WRITING DATA ============
df = pd.read_csv('file.csv')
df = pd.read_excel('file.xls')
df.to_csv('output.csv', index=False)
df.to_sql('table', conn, index=False)

# ============ BASIC INFO ============
df.head()          # first 5 rows
df.tail(3)         # last 3 rows
df.shape           # (rows, columns) -> (100, 5)
df.columns         # column names
df.dtypes          # data types
df.info()          # summary
df.describe()      # statistics

# ============ SELECTING DATA ============
df['col']                 # single column (Series)
df[['col1', 'col2']]      # multiple columns (DataFrame)
df.loc[0]                 # row by label/index
df.loc[0:5, 'col']        # rows 0-5, specific column
df.iloc[0:5, 0:2]         # by position (first 5 rows, first 2 cols)

# ============ FILTERING ============
df[df['age'] > 30]                          # where age > 30
df[df['country'] == 'Italy']                # where country is Italy
df[df['country'].isin(['Italy', 'Spain'])]  # where country in list
df[(df['age'] > 30) & (df['salary'] > 50000)]  # multiple conditions

# ============ UNIQUE VALUES ============
df['country'].unique()      # array of unique values -> ['Italy', 'Spain', 'France']
df['country'].nunique()     # count unique -> 3
df['country'].value_counts()  
# Italy     10
# Spain      8
# France     5

# ============ MISSING DATA ============
df.isna().sum()        # count NaN per column
df.dropna()            # remove rows with NaN
df.fillna(0)           # replace NaN with 0

# ============ GROUPBY ============
df.groupby('country')['salary'].mean()
# country
# France    45000
# Italy     52000
# Spain     48000

df.groupby('country').agg({'salary': 'mean', 'age': 'max'})
#          salary  age
# France    45000   55
# Italy     52000   60

# ============ SORTING ============
df.sort_values('salary')                    # ascending
df.sort_values('salary', ascending=False)   # descending
df.sort_values(['country', 'salary'])       # multiple columns

# ============ ADDING/MODIFYING COLUMNS ============
df['new_col'] = df['salary'] * 2
df['category'] = df['age'].apply(lambda x: 'old' if x > 50 else 'young')

# ============ RENAMING ============
df.rename(columns={'old_name': 'new_name'})

# ============ DROP ============
df.drop(columns=['col1', 'col2'])
df.drop(index=[0, 1, 2])

# ============ MERGE/JOIN ============
pd.merge(df1, df2, on='id')               # inner join
pd.merge(df1, df2, on='id', how='left')   # left join

# ============ CONCAT ============
pd.concat([df1, df2])          # stack vertically
pd.concat([df1, df2], axis=1)  # stack horizontally

# ============ pd.cut() - BINNING ============
ages = pd.Series([15, 25, 35, 45, 55])
pd.cut(ages, bins=3, labels=['young', 'mid', 'old'])
# 0    young
# 1    young
# 2      mid
# 3      mid
# 4      old

# ============ QUICK PLOTTING ============
df['salary'].plot()                    # line plot
df['salary'].plot(kind='bar')          # bar plot
df.plot(x='year', y='salary')          # x vs y
df.groupby('country')['salary'].mean().plot(kind='bar')

# ============ COMMON AGGREGATIONS ============
df['col'].sum()
df['col'].mean()
df['col'].min()
df['col'].max()
df['col'].count()
df['col'].std()

# ==================== SQLITE + PANDAS CHEAT SHEET ====================

import sqlite3
import pandas as pd

# ============ CONNECT TO DATABASE ============
conn = sqlite3.connect('database.sqlite')  # creates file if doesn't exist
conn.close()                                # always close when done

# ============ PANDAS TO SQLITE ============
conn = sqlite3.connect('mydb.sqlite')

# Write entire dataframe to SQLite table
df.to_sql('table_name', conn, index=False, if_exists='replace')

# if_exists options:
#   'fail'    - error if table exists (default)
#   'replace' - drop table and recreate
#   'append'  - add rows to existing table

conn.close()

# ============ SQLITE TO PANDAS ============
conn = sqlite3.connect('mydb.sqlite')

# Read entire table
df = pd.read_sql_query('SELECT * FROM table_name', conn)

# Read with filter
df = pd.read_sql_query('SELECT * FROM happiness WHERE year > 2015', conn)

# Read specific columns
df = pd.read_sql_query('SELECT country, year, salary FROM employees', conn)

# Read with multiple conditions
df = pd.read_sql_query('''
    SELECT * FROM happiness 
    WHERE "Log GDP per capita" > 11.2 
    AND year >= 2010
''', conn)

conn.close()

# ============ IMPORTANT: COLUMN NAMES WITH SPACES ============
# Use double quotes around column names with spaces
df = pd.read_sql_query('SELECT "Country name", "Life Ladder" FROM happiness', conn)

# ============ COMMON SQL QUERIES ============
# Count rows
pd.read_sql_query('SELECT COUNT(*) FROM table_name', conn)

# Distinct values
pd.read_sql_query('SELECT DISTINCT country FROM happiness', conn)

# Order by
pd.read_sql_query('SELECT * FROM happiness ORDER BY year DESC', conn)

# Group by with aggregation
pd.read_sql_query('''
    SELECT country, AVG(salary) as avg_salary 
    FROM employees 
    GROUP BY country
''', conn)

# ============ TYPICAL WORKFLOW ============
# 1. Read Excel/CSV
df = pd.read_excel('data.xls')

# 2. Select columns
df_subset = df[['col1', 'col2', 'col3']]

# 3. Save to SQLite
conn = sqlite3.connect('mydb.sqlite')
df_subset.to_sql('mytable', conn, index=False, if_exists='replace')
conn.close()

# 4. Later, read back with filter
conn = sqlite3.connect('mydb.sqlite')
df_filtered = pd.read_sql_query('SELECT * FROM mytable WHERE col1 > 100', conn)
conn.close()

# ============ MODIFY DATA & SAVE TO NEW DB ============
# Read from db1
conn1 = sqlite3.connect('db1.sqlite')
df = pd.read_sql_query('SELECT * FROM table1', conn1)
conn1.close()

# Modify in pandas
df['new_col'] = df['old_col'] * 10
df = df.drop(columns=['old_col'])
df = df.rename(columns={'new_col': 'better_name'})

# Save to db2
conn2 = sqlite3.connect('db2.sqlite')
df.to_sql('table1', conn2, index=False, if_exists='replace')
conn2.close()

# ============ FILE SIZE ============
import os
os.path.getsize('file.sqlite')  # size in bytes


# ==================== MATPLOTLIB CHEAT SHEET ====================

import matplotlib.pyplot as plt

# ============ BASIC LINE PLOT ============
plt.plot([1, 2, 3, 4], [10, 20, 25, 30])
plt.show()

# ============ LINE PLOT WITH LABELS ============
plt.plot([2020, 2021, 2022], [100, 150, 130])
plt.xlabel('Year')
plt.ylabel('Sales')
plt.title('Sales Over Time')
plt.show()

# ============ MULTIPLE LINES (SAME PLOT) ============
plt.plot([2020, 2021, 2022], [100, 150, 130], label='Italy')
plt.plot([2020, 2021, 2022], [90, 120, 140], label='Spain')
plt.plot([2020, 2021, 2022], [80, 110, 160], label='France')
plt.legend()  # shows the labels
plt.show()

# ============ BAR PLOT ============
plt.bar(['Italy', 'Spain', 'France'], [100, 90, 80])
plt.show()

# ============ BAR PLOT WITH OPTIONS ============
plt.bar(['Italy', 'Spain', 'France'], [100, 90, 80], color='green')
plt.title('GDP by Country')
plt.xticks(rotation=45)  # rotate x labels
plt.tight_layout()       # prevent labels from cutting off
plt.show()

# ============ HORIZONTAL BAR ============
plt.barh(['Italy', 'Spain', 'France'], [100, 90, 80])
plt.show()

# ============ SCATTER PLOT ============
plt.scatter([1, 2, 3, 4], [10, 20, 15, 30])
plt.show()

# ============ HISTOGRAM ============
data = [1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 5, 5]
plt.hist(data, bins=5)
plt.show()

# ============ PIE CHART ============
plt.pie([30, 40, 30], labels=['A', 'B', 'C'])
plt.show()

# ============ PLOT FROM PANDAS DIRECTLY ============
df['salary'].plot()                      # line
df['salary'].plot(kind='bar')            # bar
df.plot(x='year', y='salary')            # x vs y
df.plot(x='year', y='salary', kind='scatter')

# ============ GROUPBY + PLOT ============
df.groupby('country')['salary'].mean().plot(kind='bar')
plt.title('Average Salary by Country')
plt.show()

# ============ MULTIPLE LINES FROM DATAFRAME ============
countries = ['Italy', 'Spain', 'France']
for country in countries:
    data = df[df['country'] == country]
    plt.plot(data['year'], data['value'], label=country)
plt.legend()
plt.show()

# ============ STYLING OPTIONS ============
plt.plot(x, y, color='red')              # color
plt.plot(x, y, linestyle='--')           # dashed line
plt.plot(x, y, marker='o')               # dots on points
plt.plot(x, y, linewidth=2)              # thicker line

# Combined:
plt.plot(x, y, color='blue', linestyle='--', marker='o', linewidth=2, label='Sales')

# ============ FIGURE SIZE ============
plt.figure(figsize=(10, 6))  # width, height in inches
plt.plot(x, y)
plt.show()

# ============ SUBPLOTS (MULTIPLE PLOTS) ============
fig, axes = plt.subplots(1, 2)  # 1 row, 2 columns
axes[0].plot(x, y)
axes[0].set_title('Plot 1')
axes[1].bar(['A', 'B'], [10, 20])
axes[1].set_title('Plot 2')
plt.show()

# 2x2 grid
fig, axes = plt.subplots(2, 2)
axes[0, 0].plot(x, y)
axes[0, 1].bar(['A', 'B'], [10, 20])
axes[1, 0].scatter(x, y)
axes[1, 1].hist(data)
plt.tight_layout()
plt.show()

# ============ SAVE FIGURE ============
plt.plot(x, y)
plt.savefig('myplot.png')
plt.savefig('myplot.pdf')

# ============ COMMON FORMATTING ============
plt.xlabel('X Label')
plt.ylabel('Y Label')
plt.title('My Title')
plt.legend()                    # show legend
plt.xticks(rotation=45)         # rotate x labels
plt.tight_layout()              # fix layout
plt.grid(True)                  # add grid
plt.xlim(0, 100)                # x axis limits
plt.ylim(0, 50)                 # y axis limits

# ============================
# PYTHON QUICK CHEAT SHEET
# Requests + GraphQL + SQLite
# ============================

# ---------- SQLite ----------
import sqlite3

# Connect / cursor
conn = sqlite3.connect("mydb.sqlite")
cur = conn.cursor()

# Create table (safe to re-run)
cur.execute("""
CREATE TABLE IF NOT EXISTS table_name (
  col1 TEXT,
  col2 INTEGER
)
""")

# INSERT (parameterized)
cur.execute(
  "INSERT INTO table_name (col1, col2) VALUES (?, ?)",
  ("value", 10)              # tuple matches the ? placeholders
)
conn.commit()

# INSERT many
rows = [("A", 1), ("B", 2)]
cur.executemany(
  "INSERT INTO table_name (col1, col2) VALUES (?, ?)",
  rows
)
conn.commit()

# SELECT with 1 parameter (NOTE the comma!)
cur.execute("SELECT * FROM table_name WHERE col1 = ?", ("A",))
print(cur.fetchall())

# SELECT with multiple parameters
cur.execute(
  "SELECT * FROM table_name WHERE col2 BETWEEN ? AND ?",
  (1, 10)
)
print(cur.fetchall())

# OR condition (same value)
q = "A"
cur.execute(
  "SELECT * FROM table_name WHERE col1 = ? OR col2 = ?",
  (q, q)
)

# IN clause (dynamic list)
ids = [1, 3, 5]
ph = ",".join(["?"] * len(ids))       # "?,?,?"
cur.execute(f"SELECT * FROM table_name WHERE col2 IN ({ph})", ids)

# Fetch methods
cur.fetchone()     # one row
cur.fetchmany(5)   # up to 5 rows
cur.fetchall()     # all rows

# Close DB
cur.close()
conn.close()


# ---------- Requests ----------
import requests

# GET JSON
r = requests.get("https://example.com/data.json", timeout=30)
r.raise_for_status()
data = r.json()

# POST JSON
r = requests.post("https://example.com/api", json={"key": "value"})
r.raise_for_status()


# ---------- GraphQL ----------
url = "https://data.rcsb.org/graphql"
query = """
{
  entries(entry_ids: ["1QK1"]) {
    rcsb_id
  }
}
"""

r = requests.post(url, json={"query": query}, timeout=30)
r.raise_for_status()
j = r.json()

if "errors" in j:
    raise RuntimeError(j["errors"])

entries = j["data"]["entries"]

Bioinformatics Forever

CHEAT SHEETs