Invocation notes for command patterns I find useful.

– Local system utilities –

Password generation

openssl rand -base64 $numCharacters

Split a newline delimited file

zcat $VAL.gz | split - -l 1500000 --filter='gzip > $FILE.gz' $VAL.gz.part.

Sum across file size for all files matching a glob pattern

find . -name "*.normed.gz" -ls | awk '{total += $7} END {print total}'

Count lines in gzipped file

gzcat filename.gz | wc -l

Local machine activity

top -o cpu
top -o mem
nettop

Converting images

From .avif to .png:

magick mogrify -format png -depth 10 -define heic:speed=2 *.avif

– Google Cloud –

vim ~/.config/gcloud
gcloud config configurations list
gcloud config configurations activate $acct
gcloud compute ssh $machine
gcloud compute scp localfile $machine:~
gsutil cp gs://path/to/file local.file

Bigtable

cbt ls
cbt ls $table
cbt -project $p -instance $i count $table
cbt -project $p -instance $i read $table prefix='' count=10

BigQuery

ARRAY(SELECT JSON_VALUE(singleton, '$.name') FROM UNNEST(JSON_QUERY_ARRAY(jsonBlob, '$')) singleton) AS fieldValuesWithinRepeatedJsonSchema

ARRAY(SELECT value FROM UNNEST(keyValuePairs) WHERE key = "Date" ORDER BY value ASC)[SAFE_OFFSET(0)] AS earliestDate

ARRAY(SELECT x FROM UNNEST([itemThatMightBeNull, otherItemThatMightBeNull]) x WHERE x is NOT NULL)

ARRAY(SELECT s.path.to.field FROM UNNEST(elements) s WHERE s.property IS NOT NULL and s.otherProperty = "value")

NET.IP_FROM_STRING, NET.IPV4_TO_INT64, NET.IP_TRUNC, NET.PUBLIC_SUFFIX, NET.REG_DOMAIN, NET.HOST

– Docker –

docker run -e env_var=$value \
	-v /Users/path/to/Documents/some/folder/to/mount:/user \
	--publish $hostLocalPort:$containerizedServicePort \
	--name $yourChoice $imageIdentifier
docker run -it --entrypoint /bin/bash $imageName:$tag

Access a database that is portforwarded to localhost on host from within container:

postgres://$USER:$PASS@docker.for.mac.host.internal:$localPort/$dbName

Kill Docker:

pkill -SIGHUP -f /Applications/Docker.app/ 'docker serve'

– Kubernetes –

Port forwarding:

kubectl --namespace=$namespace port-forward svc/$serviceName $localPort:$remotePort

– .ssh/config –

Host $location-or-*
  User $name
  HostName $ip
  LocalForward $localPort fully.qualified.domain.name:$remotePort
  IdentityFile ~/.ssh/id_rsa
  IgnoreUnknown UseKeychain AddKeysToAgent
  UseKeychain yes
  AddKeysToAgent yes

– Python –

pyenv

pyenv install 3.11
pyenv versions
pyenv local|global|shell 3.11

Environment management

| Task | poetry | pipenv | isolated conda env with pip |
| --- | --- | --- | --- |
| Initialize | poetry init | (pipenv install any library to create a Pipfile) | conda create -n $name python=3.$version --yes |
| Add dependency | poetry add $library [--group $group] | pipenv install $library | pip install $library |
| Add dependency to next major version | poetry add $library@^2.0.5 | pipenv install $library~=2.0.5 | N/A (no ability to specify) |
| Update dependencies | poetry update | pipenv install | N/A (cannot update all in one command) |
| Update a dependency | poetry update $library | pipenv update $library (side-effect: updates all) | pip install $library --upgrade |
| Create a lockfile | poetry lock [--no-update] | pipenv lock | N/A |
| Start shell | poetry shell | pipenv shell | conda activate $name |
| Run in environment | poetry run python $script.py | pipenv run python $script.py | N/A |
| Install from lockfile | poetry install | pipenv sync | N/A |
| Install in developer mode (with pointers) | N/A (included in poetry install) | pipenv install -e .[all] | pip install -e .[all] |

New project configuration

To create a set of “dev-only” dependencies in poetry (all group dependencies get installed unless they are explicitly marked optional):

poetry add --group dev $packageName
Makefile
# Absolute path of the directory containing this Makefile, so every target
# behaves the same regardless of the directory `make` is invoked from.
ROOTDIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

# These targets name actions, not files. Without .PHONY, an existing file or
# directory called `format`, `lint`, or `test` would make the target a no-op.
.PHONY: format lint test

# Rewrite sources in place: sort imports, apply black, strip notebook outputs.
# `&&` (rather than `;`) stops the recipe line if the `cd` fails, so the tools
# never run against the wrong directory.
format:
	cd ${ROOTDIR} && python -m isort . --profile black
	cd ${ROOTDIR} && python -m black --target-version py39 .
	cd ${ROOTDIR} && nbstripout notebook/*

# Check-only counterpart of `format`, plus type checking and flake8.
# isort and black get the same arguments as in `format` so the two targets
# cannot disagree about what counts as correctly formatted.
lint:
	cd ${ROOTDIR} && python -m mypy --install-types --non-interactive --check-untyped-defs .
	cd ${ROOTDIR} && python -m isort . --check-only --profile black
	cd ${ROOTDIR} && python -m flake8 .
	cd ${ROOTDIR} && python -m black --check --target-version py39 .

# Run the test suite; keep whichever runner the project uses.
test:
	# one of the following
	cd ${ROOTDIR} && pytest
	cd ${ROOTDIR} && python -m unittest discover
.pre-commit-config.yaml
repos:
- repo: local
  hooks:
    - id: format
      name: Format
      stages: [commit, push]
      language: system
      entry: make format
    - id: lint
      name: Lint
      stages: [commit, push]
      language: system
      entry: make lint
setup.cfg
[metadata]
description-file = README.md

[flake8]
ignore = E121,E203,E251,E261,E266,E302,E303,E305,E402,E501,F841,W503,E741,W605
exclude =
  .git,
  .eggs,
  .tox,
  build,
  dist,
  data,
  *.egg-info,
  notebooks,
  .mypy*,
  *.db

# Flake8 Ignored Context
# Codes:         http://flake8.pycqa.org/en/latest/user/error-codes.html
# E121: continuation line under-indented for hanging indent
# E203: black enforces white space around slice operators
# E251: unexpected spaces around keyword / parameter equals
# E261: at least two spaces before inline comment
# E266: too many leading '#' for block comment; we might want blocks starting with #####
# E302: expected 2 blank lines
# E303: too many blank lines
# E305: expected 2 blank lines after class or function definition
# E402: module level import not at top of file; useful for putting docstrings at top of module before imports
# E501: line too long
# F841: local variable is assigned to but never used; we might want the pandas syntax `df.query('blah = @variable')`
# W503: line break before binary operator
# E741: ambiguous variable name
# W605: invalid escape sequence; triggers on valid regex expression like re.search('\W')
Ignoring missing mypy types

pyproject.toml:

[[tool.mypy.overrides]]
module = [
    "tqdm"
]
ignore_missing_imports = true
Configuring poetry for a custom binary repo

pyproject.toml:

[[tool.poetry.source]]
name = "$customBinaryRepository"
url = "https://path.to.custom.binary.repository/pip-private/simple"
default = true # disables PyPI
secondary = false

[[tool.poetry.source]]
name = "pypi_secondary" # `pypi` is reserved for default PyPI source
url = "https://pypi.org/simple"
default = false
secondary = true

Configuration (see https://python-poetry.org/docs/repositories/):

poetry config http-basic.$customBinaryRepository <USERNAME_WITH_@> <PASSWORD_API_KEY>

Jupyter notebook setup

%load_ext autoreload
%autoreload 2

import logging

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(name)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    force=True)

LOGGER = logging.getLogger(__name__)

Testing

python -m unittest test_package.test_module.TestClass.test_method
pytest path/to/test_file.py::test_method

t-SNE

Derived from code by Chris Potts (Stanford).

import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def tsne_viz(df, colors=None, output_filename=None, figsize=(40, 50), random_state=42):
    """
    2d plot of `df` using t-SNE, with the points labeled by `df.index`,
    aligned with `colors` (defaults to all black).

    The rows of `df` are first reduced with PCA (at most 50 components),
    the result is embedded in 2 dimensions with t-SNE, and each row's index
    label is drawn as text at its embedded position.

    Parameters
    ----------
    df : table-like with `.index` and `.shape` (e.g. a pandas DataFrame)
        One row per item to plot; `df.index` supplies the text labels.
    colors : sequence of matplotlib colors, optional
        One color per row, in row order. `None` or an empty sequence means
        every label is drawn in black. Lists, NumPy arrays and pandas
        Series are all accepted.
    output_filename : str, optional
        If given, the figure is saved to this path; otherwise it is shown.
    figsize : (float, float)
        Figure size passed to `plt.subplots`.
    random_state : int
        Seed passed to both PCA and t-SNE.

    Source: https://github.com/cgpotts/cs224u/blob/afd64b41f845b0f444b152d0f7acf2a45228349a/vsm.py#L188
    """
    # Colors:
    vocab = df.index
    # Test for None / emptiness explicitly: `not colors` raises ValueError
    # for multi-element NumPy arrays and pandas Series.
    if colors is None or len(colors) == 0:
        colors = ['black'] * len(vocab)
    # Recommended reduction via PCA or similar. PCA cannot extract more
    # components than min(n_samples, n_features), so cap by both dimensions.
    n_components = min(50, *df.shape)
    dimreduce = PCA(n_components=n_components, random_state=random_state)
    X = dimreduce.fit_transform(df)
    print(f"Explained variance ratio: {np.round(dimreduce.explained_variance_ratio_, 2)}")
    # t-SNE:
    tsne = TSNE(n_components=2, random_state=random_state)
    tsnemat = tsne.fit_transform(X)
    # Plot values:
    xvals = tsnemat[:, 0]
    yvals = tsnemat[:, 1]
    # Plotting: an invisible plot (no marker, no line) is drawn only so the
    # axes limits are scaled to the data before the labels are placed.
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    ax.plot(xvals, yvals, marker='', linestyle='')
    # Text labels:
    for word, x, y, color in zip(vocab, xvals, yvals, colors):
        ax.annotate(word, (x, y), fontsize=8, color=color)
    ax.axis('off')
    # Output:
    if output_filename:
        fig.savefig(output_filename, bbox_inches='tight')
    else:
        plt.show()