Skip to content

The goal of PythonUniverse is to collect and organize Python skills in one place. We've collected some Python techniques, Python modules, and some libraries for data science. Each topic includes a code snippet and further explanation.

License

Notifications You must be signed in to change notification settings

windsuzu/PythonUniverse

Folders and files

Name
Last commit message
Last commit date

Latest commit

Β 

History

60 Commits
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 

Repository files navigation

Table of contents

Must Know · Classes · Functions
Collections · Itertools · Functools
String · Int · Set · Tuple
Conditional · For-Loop · Try-Except · Design · IPython
Built-ins
Numpy · Pandas · Matplotlib (Pyplot)
Seaborn

Must Know

List & Dict & Set Comprehensions

[(i, j) for i in range(3) for j in range(3) if i > j]

# [(1, 0), (2, 0), (2, 1)]

Lambda Functions

li = [1, 2, 3]
li = [*map(lambda x: x * 10, li)]

#li = [10, 20, 30]

Map

num1 = [100, 1, 20]
num2 = [19, 4, 94]
num3 = [40, 6, 30]

[*map(lambda x, y, z: max(x, y, z), num1, num2, num3)]
# [100, 6, 94]

Filter

names = ['Liam', 'Olivia', 'Noah', 'Emma', 'Oliver', 'Ava']
choice = filter(lambda x: x.startswith('O'), names)

print(*choice, sep=', ') # Olivia, Oliver

Zip

a = [1, 2, 3]
b = [4, 5, 6]

c = [*zip(a, b)]  # [(1, 4), (2, 5), (3, 6)]
a, b = zip(*c)    # a=(1, 2, 3),  b=(4, 5, 6)

*args & **kwargs

Defining Functions with *arg and **kwarg

def example(a, *arg, b=0, **kwarg):
    print(a)     # 1
    print(arg)   # (2, 3)
    print(b)     # 1
    print(kwarg) # {'x': 'a', 'y': [1, 2, 3]}

example(1, 2, 3, b=1, x='a', y=[1, 2, 3])

Calling Functions with *arg and **kwarg

def func(greet, time, name):
    print(greet, time, name)

func(*["Good", "Morning"], **{"name": "Jay"})
# Good Morning Jay

Unpack Variables

Unpacking Iterable

a, b, *_ = [1, 2, 3, 4, 5]
# 1, 2, [3, 4, 5]

Unpacking Generator

first, *amid, last = map(lambda x: x**2, range(1, 10000))
first  # 1
last   # 99980001

Unpacking in For-loop

sales = [("Pencil", 0.22, 1500), ("Notebook", 1.30, 550)]

for product, *_ in sales:
    print(product)
    # Pencil, Notebook

Unpacking Function

def compute(i):
    """Return ``i`` followed by its 2nd through 5th powers, as a 5-tuple."""
    higher_powers = (i ** exponent for exponent in range(2, 6))
    return (i, *higher_powers)

num, power, cube, *_ = compute(3)
power  # 9
cube   # 27

Combining Dicts

number = {"one": 1, "two": 2}
letter = {"a": "A", "b": "B"}

combine = {**number, **letter}
combine  # {'one': 1, 'two': 2, 'a': 'A', 'b': 'B'}

Generator (map, filter, zip)

from itertools import takewhile


def square_it(value):
    """Lazily yield the squares 0**2, 1**2, ..., (value - 1)**2."""
    for i in range(value):
        yield i**2


li = square_it(10_000_000)

# The squares only grow, so stop pulling from the generator at the first
# value >= 50 instead of filtering all 10,000,000 of them.
small_squares = list(takewhile(lambda sq: sq < 50, li))
small_squares  # [0, 1, 4, 9, 16, 25, 36, 49]

Closure & Decorator

def count_decorator(count):  # decorator factory: takes the repeat count
    """Build a decorator that logs a call and runs the function ``count`` times.

    Args:
        count: how many times the decorated function is invoked per call.

    Returns:
        A decorator. The decorated function prints its name and arguments,
        calls the original ``count`` times, and returns the result of the
        last call (``None`` when ``count`` is 0).
    """
    from functools import wraps  # local import keeps the snippet self-contained

    def decorator(orig_func):
        @wraps(orig_func)  # keep orig_func's __name__ / __doc__ on the wrapper
        def wrapper(*args, **kwargs):
            print(f"func name: {orig_func.__name__}")
            print(f"func args: {args}, {kwargs}")

            result = None
            for _ in range(count):  # use the factory argument
                result = orig_func(*args, **kwargs)
            return result

        return wrapper
    return decorator  # return the actual decorator

@count_decorator(2)
def greet(msg):
    print(msg)


greet("hello")
# func name: greet
# func args: ('hello',), {}
# hello
# hello

Context Manager

import os
from contextlib import contextmanager


@contextmanager
def enterFolder(folderName):
    """Temporarily make ``folderName`` the current working directory.

    The previous working directory is restored when the ``with`` block ends,
    including when the block exits with an exception.
    """
    home = os.getcwd()
    os.chdir(folderName)
    try:
        yield
    finally:
        # Without try/finally an error in the with-body would leave the
        # process stranded inside folderName.
        os.chdir(home)

with enterFolder('folder1'), open('example1.txt', 'w') as f:
    f.write('file1')

Magic Method

class BinaryInt(str):
    """A ``str`` holding the binary representation of an integer.

    ``BinaryInt(2)`` is the string ``"10"``; adding an ``int`` returns the
    binary string of the sum (a plain ``str``).
    """

    def __new__(cls, val):
        # Plain "b" spec: f"{val: b}" (with a space) is the sign flag and
        # would prepend a blank to every non-negative value (" 10").
        return str.__new__(cls, f"{val:b}")

    def __add__(self, val):
        # Parse our own binary text back to an int, add, and re-format.
        return f"{int(self, 2) + val:b}"


a = BinaryInt(2)
print(a)      # 10
print(a + 4)  # 110

Metaclasses

class Meta(type):
    def __new__(mtcls, name, bases, attrs):
        if name != "Base" and "must_to_do" not in attrs:
            raise TypeError("Bad Class: must_to_do() is needed")
        return super().__new__(mtcls, name, bases, attrs)


class Base(metaclass=Meta):
    def server_func(self):
        return self.must_to_do()


class Derived(Base):
    ...
# TypeError: Bad Class: must_to_do() is needed

Threading & Multiprocessing

import concurrent.futures

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(load_url, url, 60) for url in URLS]
    for future in concurrent.futures.as_completed(futures):
        result = future.result()
        print(len(result))


with concurrent.futures.ProcessPoolExecutor() as executor:
    results = executor.map(load_url, URLS, [60] * len(URLS), chunksize=4)
    for result in results:
        print(len(result))

Classes

self (class instance)

class Person:
    def __init__(self, name):
        self.name = name

    def say(self):
        return f"I'm {self.name}"


p = Person("Jay")
p.say() == Person.say(p)  # True

variables (class & instance)

class Employee:
    num_emp = 0  # Class variable

    def __init__(self, pay):
        self.pay = pay  # Instance variable
        Employee.num_emp += 1

e1 = Employee(100)
e2 = Employee(200)

e1.num_emp        # 2
Employee.num_emp  # 2

e1.pay  # 100
Employee.pay  # AttributeError: type object 'Employee' has no attribute 'pay'

method vs. classmethod vs. staticmethod

class Person:
    """A person described by a ``name`` and an ``age``."""

    def __init__(self, name, age):
        self.name, self.age = name, age

    @staticmethod
    def splitPersonString(string, split_sign="-"):
        """Split ``string`` on ``split_sign`` and return the list of pieces."""
        pieces = string.split(split_sign)
        return pieces

    @classmethod
    def fromString(cls, cls_str):
        """Alternate constructor: build an instance from a ``"name, age"`` string.

        The pieces are handed to the constructor as-is, i.e. as strings.
        """
        fields = cls.splitPersonString(cls_str, ", ")
        return cls(*fields)


p1 = Person.fromString("Jay, 99")
p1.name  # Jay
p1.age   # 99

_ (private) vs. __ (name mangling)

class Dog:
    _weight = 5  # single underscore: "private" by convention only, still accessible

    def __bark(self):  # double underscore: name-mangled to _Dog__bark
        print("bark")


dog = Dog()
dog._weight       # 5
dog.__bark()      # AttributeError: 'Dog' object has no attribute '__bark'
dog._Dog__bark()  # bark

@property (getter, setter)

class User:
    """User with a computed full name and a write-only password.

    Assigning ``password`` stores only its hash in ``password_hash``;
    reading ``password`` raises ``AttributeError``.
    """

    def __init__(self, first_name, last_name, password):
        self.first_name = first_name
        self.last_name = last_name
        self.password = password  # routed through the setter below

    @property
    def fullname(self):
        """First and last name joined by a single space."""
        return f"{self.first_name} {self.last_name}"

    @property
    def password(self):
        raise AttributeError("password is not readable.")

    @password.setter
    def password(self, password):
        from hashlib import md5

        # Hash the value that was passed in. b"{password}" is a plain bytes
        # literal (there are no bytes f-strings), so every user used to get
        # the same hash.
        # NOTE: MD5 is kept for the demo only; real password storage needs a
        # salted KDF such as hashlib.scrypt.
        self.password_hash = md5(password.encode()).hexdigest()


user = User("Mimi", "Wang", "0000")
user.fullname       # Mimi Wang
user.password_hash  # a 32-character MD5 hex digest
user.password       # Attribute Error: password is not readable.

LEGB (local, enclosing, global, builtins)

min([1, 2, 31])  # builtins min
min = "global min"

def outer():
    # we can do "global min" here to change global
    min = "enclosing min"
    
    def inner():
        # we can do "nonlocal min" here to change enclosing
        min = "local min"

Abstract class

from abc import ABC, abstractmethod


class Base(ABC, object):
    @property
    @abstractmethod
    def foo(self):
        ...

    @abstractmethod
    def do(self):
        ...

Dataclasses

from dataclasses import InitVar, dataclass, field
from typing import List


@dataclass
class InventoryItem:
    name: str
    unit_price: float = field(default=0.0)
    quantity_on_hand: int = field(default=0, repr=False)
    parts: List[str] = field(default_factory=list)
    parts_number: InitVar[int] = 0

    def __post_init__(self, parts_number):
        self.parts.extend([f"part{i}" for i in range(1, parts_number + 1)])


item = InventoryItem("product", parts_number=2)
# InventoryItem(name='product', unit_price=0.0, parts=['part1', 'part2'])

Classes in Dynamic Language

def getClass(x):
    if x == 1:
        for i in range(11):

            class Example:
                a = i

        return Example


cls = getClass(1)
cls.b = "123"
print(cls.a, cls.b)  # 10 123

Functions

Enclosing function

def add_with_b(b):
    def add(a):
        return a + b
    return add

add4 = add_with_b(4)
add4(3)  # 7
add4(7)  # 11

Attrs

class Cat:
    def __repr__(self):
        return f"({self.name}: {self.age})"

listOfCats = []
attrs = [{"name": "meow1", "age": 5}, {"name": "meow2", "age": 10}]

for attr in attrs:
    cat = Cat()
    for key, val in attr.items():
        setattr(cat, key, val)
    listOfCats.append(cat)


print(listOfCats)
# [(meow1: 5), (meow2: 10)]

Functions in Dynamic Language

for i in range(100):
    def say():
        print(i)


def returnFunc(a):
    if a < 100:
        def mul(b):
            print(a * b)
        return mul
    else:
        def add(b):
            print(a + b)
        return add

Collections

defaultdict

from collections import defaultdict

d = defaultdict(list)
d["a"] = [1, 2, 3]
d["b"].append(4)
d["c"].extend([5, 6])

# defaultdict(<class 'list'>, {'a': [1, 2, 3], 'b': [4], 'c': [5, 6]})

OrderedDict

from collections import OrderedDict

location = ["C", "B", "A"]
population = [32, 46, 12]

d = OrderedDict({l: p for l, p in zip(location, population)})
# OrderedDict([('C', 32), ('B', 46), ('A', 12)])

d["D"] = 44
# OrderedDict([('C', 32), ('B', 46), ('A', 12), ('D', 44)])

d.popitem(last=False)
# OrderedDict([('B', 46), ('A', 12), ('D', 44)])

d.move_to_end("D", last=False)
# OrderedDict([('D', 44), ('B', 46), ('A', 12)])

Counter

from collections import Counter

c = Counter(cats=4, dogs=8)
# Counter({'dogs': 8, 'cats': 4})

c.update(birds=10)
# Counter({'birds': 10, 'dogs': 8, 'cats': 4})

c = c - Counter({"birds": 5})
# Counter({'dogs': 8, 'birds': 5, 'cats': 4})

c.most_common(2)
# [('dogs', 8), ('birds', 5)]

namedtuple

from collections import namedtuple

Dog = namedtuple("Dog", "name, age")
d1 = Dog("funny", 4)

features = ["happy", 3]
d2 = Dog._make(features)
# Dog(name='happy', age=3)

d2._asdict()
# {'name': 'happy', 'age': 3}  (a plain dict since Python 3.8; an OrderedDict before)

deque

from collections import deque

li = [40, 30, 50, 46, 39, 44]
d = deque(li[:2])

# Let's compute the moving average with a window of 3
d.appendleft(0)
s = sum(d)

for elem in li[2:]:
    s += elem - d.popleft()
    d.append(elem)
    print(s / 3)
    # 40, 42, 45, 43

Itertools

Infinite iterators

count

from itertools import count 

gen = count(2.5, 0.5)

for x in gen:
    print(x)
    # 2.5, 3.0, 3.5, 4.0, ... non-stop

cycle

from itertools import cycle 

gen = cycle([1, 2, 3])

for x in gen:
    print(x)
    # 1, 2, 3, 1, 2, ... non-stop

repeat

from itertools import repeat 

class Cat:
    ...
    
gen = repeat(Cat(), 2)

for cat in gen:
    print(cat)
    # <__main__.Cat object at 0x0000019AC1C5D348>
    # <__main__.Cat object at 0x0000019AC1C5D348>

Iterators terminating on the shortest input sequence

accumulate

import operator
from itertools import accumulate

gen = accumulate([1, 2, 3, 4])
list(gen)  # [1, 3, 6, 10]


gen = accumulate([1, 2, 3, 4], func=operator.mul)
list(gen)  # [1, 2, 6, 24]

chain

from itertools import chain

gen = chain([1, 2], [3, 4])
list(gen)  # [1, 2, 3, 4]


gen = chain("AB", "CD")
list(gen)  # [A, B, C, D]

compress

from itertools import compress

gen = compress([1, 2, 3], [1, 0, 1])
gen = compress([1, 2, 3], [True, False, True])  # same

list(gen)  # [1, 3]

filterfalse

from itertools import filterfalse

gen = filterfalse(lambda x: x%2 == 0, [1, 2, 3])

list(gen)  # [1, 3]

groupby

from itertools import groupby

gen = groupby("AABBCCCAA")  # default func = lambda x: x
for k, g in gen:
    print(k, list(g))
    # A [A, A]
    # B [B, B]
    # C [C, C, C]
    # A [A, A]


gen = groupby([1, 2, 3, 4], lambda x: x // 3)
for k, g in gen:
    print(k, list(g))
    # 0 [1, 2]
    # 1 [3, 4]


gen = groupby([("A", 100), ("B", 200), ("C", 600)], lambda x: x[1] > 500)
for k, g in gen:
    print(k, list(g))
    # False [(A, 100), (B, 200)]
    # True  [(C, 600)]

islice

from itertools import islice

# islice(iterable, stop) / islice(iterable, start, stop[, step]) slices any
# iterable lazily, without building the sequence first.
gen = islice([1, 2, 3], 2)  # equivalent to A[:2]
list(gen)  # [1, 2]


gen = islice("ABCD", 2, 4)  # equivalent to A[2:4]
list(gen)  # [C, D]


gen = islice("ABCD", 0, None, 2)  # equivalent to A[::2]
list(gen)  # [A, C]

starmap

from itertools import starmap

# with only one argument
gen = starmap(lambda x: x.lower(), "ABCD")
list(gen)  # [a, b, c, d]


# with 2 arguments
gen = starmap(lambda x, y: x + y, [(1, 2), (3, 4)])
list(gen)  # [3, 7]


# with different numbers of arguments
gen = starmap(lambda *keys: sum(keys) / len(keys), [[3, 8, 3], [4, 2]])
list(gen)  # [4.6666667, 3.0]

takewhile

from itertools import takewhile

gen = takewhile(lambda x: x < 2, [1, 2, 3, 2, 1])
list(gen)  # [1]

gen = takewhile(lambda x: x.isupper(), "ABCdefgHIJ")
list(gen)  # [A, B, C]

dropwhile

from itertools import dropwhile

# dropwhile skips items while the predicate holds, then yields everything
# that follows (later items are not re-tested).
gen = dropwhile(lambda x: x < 2, [1, 2, 3, 2, 1])
list(gen)  # [2, 3, 2, 1]


gen = dropwhile(lambda x: x.isupper(), "ABCdefgHIJ")
list(gen)  # [d, e, f, g, H, I, J]

zip_longest

from itertools import zip_longest

gen = zip_longest("ABC", ("X", "Y"))
list(gen)  # [('A', 'X'), ('B', 'Y'), ('C', None)]


gen = zip_longest("ABC", [1, 2], fillvalue=-1)
list(gen)  # [('A', 1), ('B', 2), ('C', -1)]

Combinatoric iterators

product

from itertools import product

gen = product("AB", "CD")
list(gen)  # [AC, AD, BC, BD]


gen = product("AB", repeat=2)
list(gen)  # [AA, AB, BA, BB]


gen = product("AB", "CD", repeat=2)
list(gen)
# [ACAC, ACAD, ACBC, ACBD,
#  ADAC, ADAD, ADBC, ADBD,
#  BCAC, BCAD, BCBC, BCBD,
#  BDAC, BDAD, BDBC, BDBD]

permutations

from itertools import permutations

# Each result is a tuple of items; the comments show them joined for brevity.
gen = permutations("ABC")  # same as r=3
list(gen)  # [ABC, ACB, BAC, BCA, CAB, CBA]


gen = permutations("ABC", r=2)
list(gen)  # [AB, AC, BA, BC, CA, CB]


gen = permutations("ABC", r=1)
list(gen)  # [A, B, C]

combinations

from itertools import combinations

# Each result is a tuple of items; the comments show them joined for brevity.
gen = combinations("ABC", 1)
list(gen)
# [A, B, C]


gen = combinations("ABC", 2)
list(gen)
# [AB, AC, BC]


gen = combinations("ABC", 3)
list(gen)
# [ABC]

combinations_with_replacement

from itertools import combinations_with_replacement

# Like combinations, but an item may be picked more than once.
gen = combinations_with_replacement("ABC", 1)
list(gen)
# [A, B, C]


gen = combinations_with_replacement("ABC", 2)
list(gen)
# [AA, AB, AC,
#  BB, BC,
#  CC]


gen = combinations_with_replacement("ABC", 3)
list(gen)
# [AAA, AAB, AAC, ABB, ABC, ACC,
#  BBB, BBC, BCC,
#  CCC]

Functools

Reduce

from functools import reduce

reduce(lambda x, y: x - y, [1, 2, 3, 4, 5], 100)  # 85

String

f-string

first_name = "Kain"
last_name = "Mccarthy"
print(f"Hi, I'm {first_name} {last_name}.")  # Hi, I'm Kain Mccarthy.


pi = 3.14159265359
print(f"{pi:.2f}")  # 3.14


d = {"name": "Shelly"}
print(f"She is {d['name']}")  # She is Shelly


i = 1000000
print(f"{i:,}")  # 1,000,000


# Ref:
#   * https://youtu.be/nghuHvKLhJA
#   * https://blog.louie.lu/2017/08/08/outdate-python-string-format-and-fstring/

Int

Underscore Placeholders

a = 100_000_000
b =  10_000_000
c =  1_0_0

print(f"{a+b+c:,}")  # 110,000,100


# Ref:
#   * https://youtu.be/C-gEQdGVXbk&t=140

Set

Search

long_list = [i for i in range(100_000_000)]
long_set = set(long_list)

%%time
100_000_000 in long_list
# False
# Wall time: 1.26 s


%%time
100_000_000 in long_set
# False
# Wall time: 0 ns


# Ref:
#   * https://stackoverflow.com/questions/2831212/python-sets-vs-lists/17945009
#   * https://youtu.be/r3R3h5ly_8g?t=1010

Tuple

Swap

a, b = 1, 2
a  # 1
b  # 2

a, b = b, a
a  # 2
b  # 1


# Ref:
#   * https://youtu.be/VBokjWj_cEA?list=LL&t=445

Condition

Ternary operator

if x < 1:
    x += 1
else:
    x -= 1

# equivalent to:

x = (x + 1) if (x < 1) else (x - 1)


# Ref:
#   * https://www.youtube.com/watch?v=C-gEQdGVXbk&t=34s

For-Loop

Enumerate

arr = ["a", "b", "c"]

for index, element in enumerate(arr):
    print(index, element)
    # 0 a
    # 1 b
    # 2 c

for index, element in enumerate(arr, start=3):
    print(index, element)
    # 3 a
    # 4 b
    # 5 c


# Ref
#   * https://youtu.be/VBokjWj_cEA?list=LL&t=190

For-Else

for text in "to be or not to be".split():
    if text.strip().startswith("o"):
        print(f"Found it! `{text}`")
        break
else:
    print("Not found")

# Found it! `or`


# Ref:
#   * https://www.youtube.com/watch?v=Dh-0lAyc3Bc

Try-Except

TEEF

try:
    print(1/1)
except Exception as e:
    print(e)
else:
    print("Safe")  # executed when except didn't happen
finally:
    print("Done")  # Always executed

# 1.0
# Safe
# Done


# Ref:
#   * https://youtu.be/VBokjWj_cEA?list=LL&t=1331

Design

Annotation

def func(a: str, b: int = 3) -> str:
    return a*b

func.__annotations__  # {'a': <class 'str'>, 'b': <class 'int'>, 'return': <class 'str'>}

func("hi")     # hihihi
func("hi", 5)  # hihihihihi
def func(a: "str longer than 5", b: 1+2 = 3) -> "str longer b times":
    return a*b

func.__annotations__  # {'a': 'str longer than 5', 'b': 3, 'return': 'str longer b times'}

func("hi")         # hihihi
func("ohayou", 2)  # ohayouohayou

Ref

Typing

from typing import Any, Dict, Iterable, List, Union


def func(a: List[int], b: Union[str, int], c: Dict[str, int], d: Iterable, e: Any):
    print(len(a))
    
    print(f"{b} can be str or int.")
    
    print(f"{c['something']} will return int.")
    
    for i in d:
        print(i)
        
    print(f"{type(e)} can be any type.")


# Ref:
#   * https://myapollo.com.tw/zh-tw/python-typing-module/

Pass and Ellipsis

# Style 1
def my_abstract_method(self):
    pass

# Style 2
def my_abstract_method(self):
    ...

# Style 3
def my_abstract_method(self):
    """
    This function is ...
    """

# Ref:
#   * https://stackoverflow.com/questions/55274977/when-is-the-usage-of-the-python-ellipsis-to-be-preferred-over-pass
#   * https://stackoverflow.com/questions/772124/what-does-the-ellipsis-object-do

IPython

VSCode Python Interactive window

#%%
1+1
# 2


# Ref:
#   * https://code.visualstudio.com/docs/python/jupyter-support-py

Time Measure

One Line

%time sleep(0.3)  
# Wall time: 310 ms

%timeit sleep(0.3)
# 311 ms ± 2.06 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Multiple Lines

%%time
for i in range(10):
    sleep(0.1)
# Wall time: 1.09 s


%%timeit
for i in range(10):
    sleep(0.1)
# 1.09 s ± 2.07 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Ref

Memory Measure

Installation

!pip install -U memory_profiler

%load_ext memory_profiler

One Line

%memit [i for i in range(1000)]

# peak memory: 51.31 MiB, increment: 0.36 MiB

Multiple Lines

%%memit
l = []
for x in range(10000):
    l.append(x*2)

# peak memory: 52.76 MiB, increment: 0.70 MiB

Ref

Modules

pathlib

from pathlib import Path

# Create nested folders; exist_ok avoids an error when they already exist.
sub_folder = Path("subfolder/subfolder")
sub_folder.mkdir(parents=True, exist_ok=True)

# The / operator joins path components; a plain string is enough on the right.
file_ = sub_folder / "test.txt"
file_.touch()

file_.write_text("Hello")
file_.read_text()  # 'Hello'

# Clean up: delete the file, then the (now empty) innermost folder.
file_.unlink()
sub_folder.rmdir()

Numpy

Create Array or Matrix

np.array([[1, 2], [3, 4], [5, 6]])  # create from list

np.zeros((3, 3))  # create filled with 0's

np.ones((2, 4, 4))  # create filled with 1's

np.empty((5, 2))  # allocate without initializing: fast, but the values are arbitrary

np.arange(2, 10, 3)  # create array from range (start, end, step_size); end is exclusive -> [2, 5, 8]

np.linspace(5, 50, 20)  # create a linear space (start, end, num_elements); end is included

# create from random generator
rng = np.random.default_rng(seed=42)

rng.random((2, 4))

rng.normal(3, 2.5, size=(2, 4))  # sample from N(3, 6.25)

rng.integers(low=2, high=10, size=(10, 2))  # random integer matrix

Basic Operations

Sort and Concatenate

np.sort(a, axis=None)
np.sort(a, axis=-1)[::-1]
a.sort()
a[::-1].sort()

np.concatenate((a, b), axis=None)
np.concatenate((a, b), axis=2)

Element-wise

a = np.arange(5)           # [0, 1, 2, 3, 4]
b = np.ones(5, dtype=int)  # [1, 1, 1, 1, 1]

a + b       # [1 2 3 4 5]
a - b       # [-1  0  1  2  3]
a ** 2      # [ 0  1  4  9 16]  (power is **; `a ^ 2` is bitwise XOR -> [2 3 0 1 6])
a * 10      # [ 0 10 20 30 40]
a > 2       # [False False False  True  True]
np.sqrt(a)  # [0. , 1.  , 1.41421356, 1.73205081, 2. ]
a * b  # [0 1 2 3 4]  element-wise product
a @ b  # 10           dot product

All (None) Column-wise (0), Row-wise (1)

A = np.random.default_rng(42).random((2, 4))
# [[0.77395605, 0.43887844, 0.85859792, 0.69736803],
#  [0.09417735, 0.97562235, 0.7611397 , 0.78606431]])

A.max()        # 0.97562235
A.max(axis=0)  # [0.77395605, 0.97562235, 0.85859792, 0.78606431]
A.max(axis=1)  # [0.85859792, 0.97562235]

A.mean()        # 0.6732255180088094
A.mean(axis=0)  # [0.4340667 , 0.7072504 , 0.80986881, 0.74171617]
A.mean(axis=1)  # [0.69220011, 0.65425093]

Indexing and Slicing

# Index and slicing arrays
x[1, 3] == x[1][3]
y[1:5:2, ::3]


# Indexing arrays
x[np.array([0, 1, 2, -1, -2])]
y[np.array([1, 2, 3]), 1:4:2]
y[np.array([1, 2]), np.array([-1, -1])]


# Masking arrays
x[x>5]
x[(x%2==0) | (x>7)]
y[[True]*3 + [False] + [True] + [False], 2::2]


# Ellipsis syntax
x[-1, ..., 3]  # same as x[-1, :, 3]
x[:3, ...]  # same as x[0:3, :, :] and x[0:3] and x[:3]
x[::2, ..., np.array([0, 2])]  # same as x[0:5:2, :, np.array([0, 2])]

Shape Manipulation

A = np.array([[[1, 2, 3], [4, 5, 6]], [[4, 6, 8], [2, 1, 6]]])

A.shape  # (2, 2, 3)

A = A.reshape(3, 2, 2)  # (3, 2, 2)

A = A[np.newaxis, ...]  # (1, 3, 2, 2)

A = np.expand_dims(A, axis=4)  # (1, 3, 2, 2, 1)

A = A.flatten()  # (12,)

A = A.reshape(2, -1, 2)  # (2, 3, 2)

Copying

# shallow copy: values will change on every variable
a = np.arange(10).reshape(5, 2)
b = a.view()
c = a.reshape(-1)
d = a[:3, :1]


# deep copy: copy and create an entirely new array
a = np.arange(10000000)
b = a[:100].copy()
del a

Broadcasting

# scalar broadcasting
a = np.array([1, 2, 3])
a * 3  # [3, 6, 9]


# general broadcasting
a =  np.ones( (8, 1, 6, 1))
b = np.zeros(    (7, 1, 5))
(a*b).shape  # 8, 7, 6, 5


# outer product
a = np.arange(4)[:, np.newaxis]  # (4, 1)
b = np.array([1, 2, 3])  # (3,)

a + b  # (4, 3)

# [0]  + [1, 2, 3] =  [1 2 3]
# [1]                 [2 3 4]
# [2]                 [3 4 5]
# [3]                 [4 5 6]

Pandas

Creation and Viewing

# Create Series
pd.Series([1, 2, 3, 4, 5])
pd.Series(np.arange(1, 6), index=list("abcde"))
pd.Series({"a": 100, "b": 50, "c": 120})
pd.Series("hi", index=list("12345"))


# Create DataFrame
pd.DataFrame({
    "col_1": [1, 2, 3, 4, 5],
    "col_2": np.arange(1, 6),
    "col_3": pd.Series(np.arange(1, 7), index=list("abc123")),
}, index=list("abcde"))

pd.DataFrame(
    [
        {"a": 1, "b": 2},
        {"b": 10, "c": 5},
        {"a": 55, "b": 489, "c": 32, "d": 590},
    ],
    index=["first", "second", "third"],
    columns=list("ab")
)

pd.DataFrame(
    np.arange(10).reshape(2, 5),
    # [[0,1,2,3,4], [5,6,7,8,9]]
    index=pd.date_range("20200101", periods=2),
    columns=list("abcde"))


# Viewing
df.head(2)
df.tail(3)
df.index
df.columns
df.to_numpy()
df.sort_index()
df.sort_values("col_name")

Selection

| | Single Column | Multiple Columns | Continuous Columns | All Columns |
|---|---|---|---|---|
| Single Row | df.loc[row, column] or df.at[row, column] | df.loc[row, [column, column]] | df.loc[row, column:column] | df.loc[row] |
| Multiple Rows | df.loc[[row, row], column] | df.loc[[row, row], [column, column]] | df.loc[[row, row], column:column] | df.loc[[row, row]] |
| Continuous Rows | df.loc[row:row, column] | df.loc[row:row, [column, column]] | df.loc[row:row, column:column] | df[row:row] |
| All Rows | df[column] | df[[column, column]] or df.loc[:, [column, column]] | df.loc[:, column:column] | df |

df["col1"]  # one column
df[["col1", "col2"]]  # several columns
df["row1":"row5"]  # a slice inside df[...] selects rows

# The positional equivalents below assume row1..row5 / col1..col5 are the
# first five labels. Label slices include their end label, so five labels
# map to the positional slice 0:5.
df.loc["row1", "col1"]  # df.iloc[0, 0]
df.at["row1", "col1"]  # df.iat[0, 0]

df.loc["row1", ["col1", "col2"]]  # df.iloc[0, [0, 1]]
df.loc["row1", "col1":"col5"]  # df.iloc[0, 0:5]

df.loc[["row1", "row2"]]  # df.iloc[[0, 1]]
df.loc["row1":"row5", "col1"]  # df.iloc[0:5, 0]


# Boolean masks
df[(df["col1"] > 18)]
df[(df > 6) & (df < 25)]
df[df["col1"].isin([10, 15, 0])]
  • df.iloc is the same as df.loc but uses integer positions.
  • df.iat is the same as df.at but uses integer positions.
  • Details 🔥

Setting, Deleting, and Handling

# Modify columns
df["col1"] += 10
df.loc[:, "col1"] = "bar"
df.loc[:, ["col1", "col3"]] = np.arange(12).reshape(6, 2)

# Modify single element
df.loc["row1", "col1"] = 0
df.iloc[0, 0] = 1

# Modify by boolean indexing
df[df < 100] = -df

# Append
df["total"] = df.sum(axis=1).to_numpy()
df["gt"] = df["total"] > 50000
df["foo"] = "bar"

# Insert
df.insert(0, "col0", df["col2"][:2])  # col_index, col_name, values

# Delete column
del df["total"]
df.drop(columns=["foo"], inplace=True)  # same as `df.drop(["foo"], axis=1)`
# pop removes the column and returns it. The boolean column was created
# above under the name "gt", so popping "gt50000" would raise KeyError.
gt50000 = df.pop("gt")

# Delete row
df.drop(["e", "d"], inplace=True)

# Handle NaN
miss_df.dropna(how='any')
miss_df.fillna(value=10000000)

Operations and Apply Functions

# Arithmetic
df + df2
df - df.iloc[0]
1 / df


# Numpy
np.sqrt(df)
np.max(df, axis=1)


# Built-in
df.mean()
df.max(axis=1)


# Apply
df.apply(np.cumsum, axis=1)  # axis=1: the function receives one row at a time
df.apply(lambda x: x.sum() / x.size)  # default axis=0: x is each column (a Series), not the whole df


# Series
s.value_counts()
s.str.upper()
s.str.split("-").str.get(0)

Concat and Merge

# Concat rows
pd.concat([df[:3], df.iloc[7:, :2]])

# Merge two DataFrame
pd.merge(df, df2, on="name", how="right")

Grouping and Categorical Data Type

# Groupby
df.groupby("col_A").sum()
df.groupby(["col_A", "col_B"]).max()

# Categorical - discrete
df["grade"] = df["grade"].astype("category")
# Assigning to `.cat.categories` was removed in pandas 2.0. rename_categories
# returns a new Series, so the result has to be assigned back.
df["grade"] = df["grade"].cat.rename_categories(["Bad", "Good", "Excellent"])
df.sort_values(by="grade")  # sorts by category order, not alphabetically
df.groupby("grade").size()

# Categorical - continuous
df["grade-labels"] = pd.cut(df["score"], bins=range(0, 120, 20), labels=list("EDCBA"))

Other Pandas Tricks

# Rename Columns
df.columns = ["col_one", "col_two"]
df = df.add_prefix("Xx_")
df = df.add_suffix("_xX")
df.columns = df.columns.str.replace("Xx", "Oo")
df.columns = df.columns.str.replace("xX", "oO")


# Reverse Row or Column Order
df.loc[::-1].reset_index(drop=True) # reverse rows
df.loc[:, ::-1] # reverse columns


# Split DataFrame into 2 random subsets
sub1 = df.sample(frac=0.75, random_state=42)
sub2 = df.drop(sub1.index)  # everything that was not sampled
# sort_index() reorders whole rows. Assigning a sorted index back
# (`sub1.index = sub1.index.sort_values()`) would only relabel the rows in
# their shuffled order, detaching each row from its original label.
sub1 = sub1.sort_index()
sub2 = sub2.sort_index()


# Filter by Category (or Largest Category)
df[df.genre.isin(["A", "D"])]
df[~df.genre.isin(["A", "D"])]
df[df.genre.isin(df.genre.value_counts().nlargest(1).index)]


# Split String into Multiple Columns
df[["first", "last"]] = df["name"].str.split(' ', expand=True)
df["city"] = df["location"].str.split(", ", expand=True)[0]


# Change Display Options (Not Change Data)
pd.set_option("display.float_format", "${:.2f}".format)
pd.reset_option("display.float_format")


# Style a DataFrame
# Per-column format strings: dates as Y/m/d, integers with a "$" prefix,
# thousands separators for volume.
style = {"Date": "{:%Y/%m/%d}", "Value": "${:d}", "Volume": "{:,}"}
# Styler.hide_index() was removed in pandas 2.0; hide(axis="index") is the
# replacement (available since pandas 1.4).
df.style.format(style) \
    .hide(axis="index") \
    .highlight_max("Value", color="red") \
    .highlight_min("Value", color="green") \
    .bar("Area", color="orange", align="zero") \
    .background_gradient(subset="Volume", cmap="Greens") \
    .set_caption("Random Chart")

Matplotlib (Pyplot)

Basic (Single Plot)

import matplotlib.pyplot as plt

# with this magic function, we can skip `plt.show()`
%matplotlib inline

plt.plot(np.sin(np.linspace(0, 10, 100)), "*-b", lw=2, markersize=5, label="sin(x)")
plt.plot(np.log(np.arange(100)), c="g", ls="--", marker=".", lw=2, markersize=5, label="log(x)")

plt.xlabel("X here")
plt.ylabel("Y here")

plt.title("sin(x) and log(x)")

plt.grid()

plt.legend()

plt.text(x=70, y=-1, s="hahahaha")

plt.annotate("wow \nmax", xy=(16, 1), xytext=(40, 0.9), arrowprops={"facecolor": "orange", "shrink": 0.05})
plt.annotate("wow \nmax again", xy=(78, 1), xytext=(95, 0.9), arrowprops={"facecolor": "red", "shrink": 0.05})

Multiple Figures and Axes

# Object-oriented style
fig1, ax = plt.subplots()
ax.plot(...)

fig2, axs = plt.subplots(2, 1)
axs[0].plot(...)
axs[1].plot(...)


# Pyplot style
plt.figure(1)
plt.title("Figure 1")

plt.figure(2)
plt.subplot(311)
plt.title("Figure 2")

plt.subplot(323)
plt.subplot(324)

plt.subplot(337)
plt.subplot(338)
plt.subplot(339)

Line Plots and Filling Area

# `...` marks where the remaining data points go.
years = [1.1, 1.3, 1.5, 2.0, 2.2, ...]
salary = [39343.00, 46205.00, 37731.00, 43525.00, 39891.00, ...]
salary_mean = np.mean(salary)

# Line Plots
plt.plot(years,
         salary,
         marker="o",
         markersize=5,
         lw=2,
         ls="-",
         )


# Filling Areas
# `where` needs an element-wise boolean mask. Comparing the plain list with
# a float raises TypeError, so convert it to an array first.
plt.fill_between(years,
                 salary,
                 salary_mean,
                 where=(np.asarray(salary) > salary_mean),
                 alpha=.4,
                 color="green",
                 edgecolor="black",
                 interpolate=True,
                 label="On Average"
                 )

Time Series

import matplotlib.dates as mdates

# Three weeks of daily dates with normally distributed prices.
dates = np.arange(np.datetime64("2021-01-01"), np.datetime64("2021-01-22"))
prices = np.random.default_rng(42).normal(500, 30, len(dates))

# Major ticks once a week with a "weekday, day month" label; minor ticks daily.
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%a, %d %m"))
plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=7))
plt.gca().xaxis.set_minor_locator(mdates.DayLocator())

# plt.plot_date is deprecated; plt.plot accepts datetime64 x-values directly.
plt.plot(dates, prices,
         ls="solid",
         c="orange",
         marker="^",
         markersize=10)

plt.grid()
plt.tight_layout()

Scatter Plots

temperature = [14.2, 16.4, 11.9, 15.2, ...]
ice_cream_sales = [215, 325, 185, 332, ...]
# Scale the sales by their norm so the values can drive the colormap.
colors = np.array(ice_cream_sales) / np.linalg.norm(ice_cream_sales)

plt.scatter(temperature, ice_cream_sales, 
            s=ice_cream_sales,  # marker size follows the ice cream sales
            c=colors,  # marker color follows the (normalized) ice cream sales
            cmap="Greens",  # preferred color type
            edgecolor="black",  # the edge color of points
            lw=0.5,  # the edge width of points
            alpha=.75,
            )

plt.xlabel("temperature")
plt.ylabel("ice cream price")
plt.yscale("log")   # use log scale on y-axis to handle outliers

# The colorbar explains the color encoding of the scatter points.
cbar = plt.colorbar()
cbar.set_label("Expensive")

plt.tight_layout()

Bar Charts

# Bar Charts
# `...` marks where the remaining data points go; all four lists must have
# the same length.
ages = [25, 26, 27, 28, 29, ...]
salary_all = [38496, 42000, 46752, 49320, 53200, ...]
salary_py = [...]  # median Python salaries, one per age
salary_js = [...]  # median JavaScript salaries, one per age

index = np.arange(len(ages))
width = 0.25

# Three bars per age: shift the outer two by one bar width around the tick.
plt.bar(index - width, salary_all, width=width, label="All Devs")
plt.bar(index, salary_py, width=width, label="Python")
plt.bar(index + width, salary_js, width=width, label="JavaScript")
plt.xticks(ticks=index, labels=ages)

plt.title("Median Salary (USD) by Age")
plt.xlabel("Ages")
plt.ylabel("Median Salary (USD)")
plt.legend()
plt.tight_layout()


# Horizontal Bar Charts
language = ['JavaScript', 'HTML/CSS', 'SQL', 'Python', ...]
popularity = [59219, 55466, 47544, 36443, ...]

plt.barh(language, popularity)
plt.title("Most Popular Languages")
plt.xlabel("Number of People Who Use")
plt.tight_layout()

Pie Charts

grade = ["A", "B", "C", "D", "E"]
number = [10, 18, 23, 8, 5]
explode = [0.1, 0, 0, 0, 0]

plt.pie(number, 
        labels=grade,
        shadow=True,
        autopct="%1.1f%%",
        pctdistance=0.6,
        startangle=90,
        explode=explode
        )

plt.title("Test Grade")
plt.tight_layout()

Histograms

height_stats = np.random.default_rng(42).normal(160, 15, 1000)

# density=True normalizes the bars so they share a scale with the KDE curve.
interval_bin = [120, 130, 140, 150, 160, 170, 180, 190, 200]
plt.hist(height_stats, bins=interval_bin,
         edgecolor="black", lw=1, density=True)

# Plot the probability density curve
import scipy.stats as ss
# gaussian_kde is public API on scipy.stats; the `scipy.stats.kde`
# sub-namespace is deprecated.
density = ss.gaussian_kde(height_stats)
index = np.arange(120, 200)
plt.plot(index,
         density.evaluate(index),
         color="pink",
         lw=3,
         ls="--",
         label="Probability Density")

# Plot the mean line
plt.axvline(np.mean(height_stats), c="orange", lw=5, label="Height Mean")
plt.legend()

plt.title("Height Stats")
plt.xlabel("Heights")
plt.ylabel("Probability Density")
plt.tight_layout()

Stack Plots

years = [1950, 1960, 1970, 1980, 1990, 2000, 2010, 2018]

# Population in millions, one value per entry of `years`.
population_by_continent = {
    'africa': [228, 284, 365, 477, 631, 814, 1044, 1275],
    'americas': [340, 425, 519, 619, 727, 840, 943, 1006],
    'asia': [1394, 1686, 2120, 2625, 3202, 3714, 4169, 4560],
    'europe': [220, 253, 276, 295, 310, 303, 294, 293],
    'oceania': [12, 15, 19, 22, 26, 31, 36, 39],
}

y = population_by_continent.values()
labels = population_by_continent.keys()
colors = ["#96ceb4", "#ffeead", "#ff6f69", "#ffcc5c", "#88d8b0"]

# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and
# later dropped the old names; fall back to the old name on older versions.
seaborn_style = "seaborn-v0_8"
plt.style.use(seaborn_style if seaborn_style in plt.style.available else "seaborn")
plt.stackplot(years, y, labels=labels, colors=colors)

plt.legend(loc="upper left")
plt.title("World Population")
plt.xlabel("Year")
plt.ylabel("Population (Millions)")
plt.tight_layout()

Image

import io
import urllib.request

import matplotlib.image as mpimg

# mpimg.imread no longer accepts URLs, so download the bytes first and hand
# imread a file-like object instead.
url = "https://www.catster.com/wp-content/uploads/1970/01/Am-ShortHair-breed_getty1140883355-768x513.png"
with urllib.request.urlopen(url) as response:
    img = mpimg.imread(io.BytesIO(response.read()), format="png")
plt.imshow(img)

# Applying pseudocolor schemes
plt.imshow(img[..., 0], cmap="gray")  # first channel only
plt.colorbar()

# Flipping Photos Vertically or Horizontally
plt.imshow(img[::-1])  # Reverse at the first axis == vertical flip
plt.imshow(img[:, ::-1])  # Reverse at the second axis == horizontal flip

Styles, Colors, Colormaps

# Switch Style
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later dropped the old names; fall back to the old name on older versions.
pastel_style = "seaborn-v0_8-pastel"
plt.style.use(pastel_style if pastel_style in plt.style.available else "seaborn-pastel")

# Data
x = np.random.default_rng(42).integers(0, 100, 100)
y = (2*x+1) * np.random.default_rng(43).normal(5, 1, 100)

# `import sklearn` alone does not load the linear_model submodule.
import sklearn.linear_model

# The estimator is fed 2-D inputs, hence the added axis.
regr = sklearn.linear_model.LinearRegression()
regr.fit(x[:, np.newaxis], y[:, np.newaxis])
regr_line = regr.predict(x[:, np.newaxis])

# Plotting with fancy color and colormap
plt.scatter(x, y, c=y, alpha=0.25, cmap="plasma")

plt.plot(x, regr_line,
         color="darkviolet",
         alpha=0.5,
         lw=5, ls="-",
         label="regression line")


plt.title("Linear Regression Test")
plt.xlabel("X")
plt.ylabel("y")
plt.legend()
plt.colorbar()

Seaborn

Basic (Seaborn)

x = np.array(range(1, 5))
y = x**2
df = pd.DataFrame(zip(x, y), columns=["col_1", "col_2"])

# Plotting with data parameter
def plot():
    sns.lineplot(x="col_1", y="col_2", data=df)

# Seaborn Styles
sns.set_style("white")

# Scaling the plots
sns.set_context("paper", font_scale=1.5)

# Changing the figure Size
plt.figure(figsize=(8, 4))  # width, height 

# Using Seaborn with Matplotlib
plt.subplot(211)
plt.title("Square X")
plot()

# Seaborn Styles Context Manager
with sns.axes_style("darkgrid"):
    plt.subplot(212)
    plot()

plt.tight_layout()

Color Palette

# Sequential Palette
palette = sns.color_palette("YlGn")
sns.palplot(palette)
plt.title("YlGn Colormap (Sequential)")

# Diverging Palette
palette = sns.color_palette("coolwarm")
sns.palplot(palette)
plt.title("coolwarm Colormap (Diverging)")

# Qualitative Palette
palette = sns.color_palette("Pastel2")
sns.palplot(palette)
plt.title("Pastel2 Colormap (Qualitative)")



Multiple Plots

Using Matplotlib

data = sns.load_dataset("iris")

plt.figure(figsize=(11, 3))
plt.subplot(121)
sns.lineplot(x="sepal_length", y="sepal_width", data=data)

plt.subplot(122)
sns.lineplot(x="petal_length", y="petal_width", data=data)

Using Seaborn

FacetGrid

grid = sns.FacetGrid(data, col="species")
grid.map(plt.plot, "sepal_width")

PairGrid

x_vars = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
y_vars = ["species"]

grid = sns.PairGrid(data, x_vars=x_vars, y_vars=y_vars)
grid.map(sns.barplot)

About

The goal of PythonUniverse is to collect and organize Python skills in one place. We've collected some Python techniques, Python modules, and some libraries for data science. Each topic includes a code snippet and further explanation.

Topics

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published