Database, SQL & PostgreSQL
where data lives. learn this well.
what is a database
a database is an organized collection of data stored and accessed electronically. a DBMS (Database Management System) is the software that manages it.
why not just use files? - files don’t support concurrent access (two users writing = corruption) - no way to query data efficiently - no transactions (partial writes = corrupted data) - no relationships between data - no access control
types of databases
relational (SQL) data in tables with rows and columns, relationships via foreign keys PostgreSQL, MySQL, SQLite, Oracle, SQL Server
document data as JSON-like documents — MongoDB, CouchDB, Firestore
key-value simple key to value store — Redis, DynamoDB
column-family data stored by column, good for analytics — Cassandra, HBase
graph nodes and edges — Neo4j, Amazon Neptune
time-series optimized for time-stamped data — InfluxDB, TimescaleDB
relational concepts
table stores data in rows and columns row is one entry in a table column is one property of a record schema is the structure definition of the database primary key uniquely identifies each row foreign key references primary key of another table index speeds up queries on a column view is a virtual table based on a query trigger is code that runs automatically on events
SQL basics
SQL (Structured Query Language) is the language for relational databases.
CREATE TABLE
CREATE TABLE users (
id SERIAL PRIMARY KEY,
username VARCHAR(50) UNIQUE NOT NULL,
email VARCHAR(255) UNIQUE NOT NULL,
password VARCHAR(255) NOT NULL,
age INT CHECK (age > 0),
bio TEXT,
created_at TIMESTAMP DEFAULT NOW()
);
CREATE TABLE posts (
id SERIAL PRIMARY KEY,
user_id INT NOT NULL REFERENCES users(id) ON DELETE CASCADE,
title VARCHAR(255) NOT NULL,
content TEXT,
published BOOLEAN DEFAULT FALSE,
created_at TIMESTAMP DEFAULT NOW()
);data types
-- numbers
INT / INTEGER -- whole number
BIGINT -- large whole number
DECIMAL(10, 2) -- exact decimal (money)
FLOAT / REAL -- approximate decimal
SERIAL -- auto-incrementing integer
-- text
CHAR(n) -- fixed length string
VARCHAR(n) -- variable length, max n
TEXT -- unlimited length
-- date/time
DATE -- date only (2026-03-05)
TIME -- time only (14:30:00)
TIMESTAMP -- date + time
TIMESTAMPTZ -- timestamp with timezone
INTERVAL -- time interval
-- other
BOOLEAN -- true/false
UUID -- universally unique identifier
JSON / JSONB -- JSON data (JSONB = binary, indexed)INSERT
INSERT INTO users (username, email, password)
VALUES ('abhishek', 'a@b.com', 'hashed_pw');
-- multiple rows
INSERT INTO users (username, email, password) VALUES
('user1', 'u1@b.com', 'pw1'),
('user2', 'u2@b.com', 'pw2');
-- insert and return
INSERT INTO users (username, email, password)
VALUES ('abhishek', 'a@b.com', 'pw')
RETURNING id, created_at;SELECT
SELECT * FROM users;
SELECT id, username, email FROM users;
SELECT username AS name FROM users;
SELECT DISTINCT country FROM users;
-- WHERE
SELECT * FROM users WHERE age > 18;
SELECT * FROM users WHERE age BETWEEN 18 AND 30;
SELECT * FROM users WHERE username IN ('a', 'b', 'c');
SELECT * FROM users WHERE bio IS NULL;
SELECT * FROM users WHERE username LIKE 'ab%';
SELECT * FROM users WHERE username ILIKE 'ab%'; -- case insensitive
-- AND / OR / NOT
SELECT * FROM users WHERE age > 18 AND country = 'India';
SELECT * FROM users WHERE age < 18 OR age > 60;
-- ORDER BY
SELECT * FROM users ORDER BY created_at DESC;
SELECT * FROM users ORDER BY age ASC, username DESC;
-- LIMIT and OFFSET (pagination)
SELECT * FROM users LIMIT 10;
SELECT * FROM users LIMIT 10 OFFSET 20; -- page 3UPDATE
UPDATE users SET age = 21 WHERE id = 1;
UPDATE users
SET age = 21, bio = 'developer'
WHERE id = 1;
UPDATE users SET age = 21 WHERE id = 1 RETURNING *;DELETE
DELETE FROM users WHERE id = 1;
DELETE FROM users WHERE created_at < '2024-01-01';
TRUNCATE users; -- delete all rows, faster than DELETEaggregate functions
SELECT COUNT(*) FROM users;
SELECT SUM(price) FROM orders;
SELECT AVG(age) FROM users;
SELECT MIN(price), MAX(price) FROM products;
SELECT ROUND(AVG(age), 2) FROM users;
-- GROUP BY
SELECT country, COUNT(*) as user_count
FROM users
GROUP BY country
ORDER BY user_count DESC;
-- HAVING (filter groups — like WHERE but for groups)
SELECT country, COUNT(*) as user_count
FROM users
GROUP BY country
HAVING COUNT(*) > 100;
-- WHERE filters rows, HAVING filters groups
SELECT country, COUNT(*) as count
FROM users
WHERE age > 18
GROUP BY country
HAVING COUNT(*) > 10;JOINs
joins combine rows from multiple tables.
users: posts:
id | name id | user_id | title
1 | abhishek 1 | 1 | first post
2 | rahul 2 | 1 | second post
3 | priya 3 | 2 | rahul's post
INNER JOIN — only matching rows
SELECT u.name, p.title
FROM users u
INNER JOIN posts p ON p.user_id = u.id;
-- priya not included (no posts)LEFT JOIN — all from left, matching from right
SELECT u.name, p.title
FROM users u
LEFT JOIN posts p ON p.user_id = u.id;
-- priya | NULL (included even with no posts)RIGHT JOIN — all from right, matching from left
SELECT u.name, p.title
FROM users u
RIGHT JOIN posts p ON p.user_id = u.id;FULL OUTER JOIN — all from both tables
SELECT u.name, p.title
FROM users u
FULL OUTER JOIN posts p ON p.user_id = u.id;multiple joins
SELECT u.username, p.title, c.content as comment
FROM users u
JOIN posts p ON p.user_id = u.id
JOIN comments c ON c.post_id = p.id
WHERE u.id = 1;subqueries
-- in WHERE
SELECT * FROM users
WHERE id IN (SELECT user_id FROM posts WHERE published = true);
-- in FROM
SELECT avg_age, country
FROM (
SELECT country, AVG(age) as avg_age
FROM users
GROUP BY country
) AS country_stats
WHERE avg_age > 25;
-- EXISTS
SELECT * FROM users u
WHERE EXISTS (
SELECT 1 FROM posts p WHERE p.user_id = u.id
);
SELECT * FROM users u
WHERE NOT EXISTS (
SELECT 1 FROM posts p WHERE p.user_id = u.id
);CTEs (Common Table Expressions)
named temporary result sets — cleaner than nested subqueries
WITH active_users AS (
SELECT * FROM users WHERE last_login > NOW() - INTERVAL '30 days'
)
SELECT * FROM active_users WHERE country = 'India';
-- multiple CTEs
WITH
active_users AS (
SELECT id, username FROM users WHERE active = true
),
user_posts AS (
SELECT user_id, COUNT(*) as post_count
FROM posts
GROUP BY user_id
)
SELECT u.username, COALESCE(p.post_count, 0) as posts
FROM active_users u
LEFT JOIN user_posts p ON p.user_id = u.id
ORDER BY posts DESC;window functions
like GROUP BY but doesn’t collapse rows
-- ROW_NUMBER
SELECT username, age,
ROW_NUMBER() OVER (ORDER BY age DESC) as rank
FROM users;
-- RANK (1,2,2,4 — skips) vs DENSE_RANK (1,2,2,3 — no skip)
SELECT username, score,
RANK() OVER (ORDER BY score DESC) as rank,
DENSE_RANK() OVER (ORDER BY score DESC) as dense_rank
FROM leaderboard;
-- PARTITION BY (rank within group)
SELECT username, country, age,
RANK() OVER (PARTITION BY country ORDER BY age DESC) as rank_in_country
FROM users;
-- LAG and LEAD (previous/next row)
SELECT date, revenue,
LAG(revenue) OVER (ORDER BY date) as prev_revenue,
revenue - LAG(revenue) OVER (ORDER BY date) as change
FROM daily_revenue;
-- running total
SELECT date, revenue,
SUM(revenue) OVER (ORDER BY date) as running_total
FROM daily_revenue;indexes
speed up queries by avoiding full table scans
CREATE INDEX idx_users_email ON users(email);
CREATE UNIQUE INDEX idx_users_username ON users(username);
-- composite index (order matters)
CREATE INDEX idx_orders_user_date ON orders(user_id, created_at);
-- partial index (only index subset)
CREATE INDEX idx_active_users ON users(email) WHERE active = true;
-- expression index
CREATE INDEX idx_lower_email ON users(LOWER(email));
DROP INDEX idx_users_email;
-- check what indexes exist
SELECT indexname, indexdef FROM pg_indexes WHERE tablename = 'users';
-- analyze query
EXPLAIN SELECT * FROM users WHERE email = 'a@b.com';
EXPLAIN ANALYZE SELECT * FROM users WHERE email = 'a@b.com';when to index: - columns in WHERE, JOIN, ORDER BY, GROUP BY - foreign keys - columns frequently searched/filtered
when NOT to: - small tables (full scan is fine) - columns rarely used in queries - tables with very heavy writes (indexes slow inserts)
transactions
operations that succeed or fail together (all or nothing)
BEGIN;
UPDATE accounts SET balance = balance - 1000 WHERE id = 1;
UPDATE accounts SET balance = balance + 1000 WHERE id = 2;
COMMIT;
-- rollback on error
BEGIN;
UPDATE accounts SET balance = balance - 1000 WHERE id = 1;
-- error here
ROLLBACK;
-- savepoints
BEGIN;
INSERT INTO orders (user_id, total) VALUES (1, 500);
SAVEPOINT order_created;
INSERT INTO order_items (order_id, product_id) VALUES (1, 10);
-- if error:
ROLLBACK TO order_created;
COMMIT;ACID: - Atomicity — all or nothing - Consistency — data always valid - Isolation — transactions don’t interfere - Durability — committed data survives crashes
normalization
organizing tables to reduce redundancy.
1NF — each column has single values, no repeating groups 2NF — 1NF + every column depends on whole primary key 3NF — 2NF + no column depends on another non-key column
example of 3NF violation:
orders: order_id | customer_id | customer_city
customer_city depends on customer_id, not order_id
fixed:
orders: order_id | customer_id
customers: customer_id | customer_city
PostgreSQL specific
connect
psql -U postgres -d mydb
psql "postgresql://user:password@host:5432/dbname"psql commands
\l list databases
\c dbname connect to database
\dt list tables
\d table describe table
\du list users
\timing toggle timing
\q quit
PostgreSQL extras
-- UUID
id UUID DEFAULT gen_random_uuid()
-- Arrays
tags TEXT[] DEFAULT '{}'
SELECT * FROM posts WHERE 'tech' = ANY(tags);
SELECT * FROM posts WHERE tags @> ARRAY['tech', 'python'];
-- JSONB
data JSONB
SELECT data->>'name' FROM users;
SELECT data->'address'->>'city' FROM users;
SELECT * FROM users WHERE data @> '{"role":"admin"}';
CREATE INDEX idx_data ON users USING GIN(data);
-- upsert
INSERT INTO users (email, name)
VALUES ('a@b.com', 'abhishek')
ON CONFLICT (email)
DO UPDATE SET name = EXCLUDED.name, updated_at = NOW();
ON CONFLICT DO NOTHING;
-- date functions
NOW()
CURRENT_DATE
DATE_TRUNC('month', created_at)
EXTRACT(YEAR FROM created_at)
created_at + INTERVAL '7 days'
AGE(created_at)
created_at AT TIME ZONE 'Asia/Kolkata'
-- conditional
CASE
WHEN age < 18 THEN 'minor'
WHEN age < 60 THEN 'adult'
ELSE 'senior'
END
COALESCE(bio, 'no bio') -- first non-null value
NULLIF(value, 0) -- return null if equal to second arg
-- full text search
SELECT * FROM posts
WHERE to_tsvector('english', content) @@ to_tsquery('python & django');
CREATE INDEX idx_fts ON posts USING GIN(to_tsvector('english', content));backup and restore
pg_dump mydb > backup.sql
pg_dump -Fc mydb > backup.dump
psql mydb < backup.sql
pg_restore -d mydb backup.dumpquery optimization tips
-- see query plan
EXPLAIN ANALYZE SELECT * FROM users WHERE email = 'a@b.com';
-- look for:
-- Seq Scan full table scan (slow on big tables)
-- Index Scan using index (good)
-- Index Only only reads index (best)
-- update stats so planner makes better decisions
ANALYZE users;
VACUUM ANALYZE users;
-- check table size
SELECT pg_size_pretty(pg_total_relation_size('users'));
-- find slow queries
SELECT query, mean_exec_time, calls
FROM pg_stat_statements
ORDER BY mean_exec_time DESC
LIMIT 10;common design patterns
soft delete
ALTER TABLE users ADD COLUMN deleted_at TIMESTAMP;
-- soft delete
UPDATE users SET deleted_at = NOW() WHERE id = 1;
-- query active only
SELECT * FROM users WHERE deleted_at IS NULL;
-- partial index for performance
CREATE INDEX idx_active ON users(id) WHERE deleted_at IS NULL;audit trail
CREATE TABLE users_audit (
id SERIAL PRIMARY KEY,
user_id INT,
operation CHAR(1), -- I/U/D
changed_at TIMESTAMP DEFAULT NOW(),
old_data JSONB,
new_data JSONB
);pagination patterns
-- offset pagination (simple but slow on large offsets)
SELECT * FROM posts ORDER BY created_at DESC LIMIT 20 OFFSET 100;
-- cursor pagination (fast, consistent)
SELECT * FROM posts
WHERE created_at < '2026-03-01T10:00:00'
ORDER BY created_at DESC
LIMIT 20;=^._.^= the database is the source of truth