Initialize Optimus
from optimus import Optimus
op = Optimus("pandas")
Creating DataFrames
df = op.create.dataframe({
("names"): ["bumbl#ebéé ", "Optim'us", "ironhide&"],
("height", "float"): [17.5, 28.0, 26.0],
"function": ["Espionage", "Leader", "Security"],
("rank", "int"): [7, 10, 7]
})
| names |
height |
function |
rank |
| bumbl#ebéé⸱⸱ |
17.5 |
Espionage |
7 |
| Optim'us |
28.0 |
Leader |
10 |
| ironhide& |
26.0 |
Security |
7 |
Data Loading
df = op.load.csv("foo.csv")
df = op.load.json("foo.json")
df = op.load.parquet("foo.parquet")
df = op.load.excel("foo.xls")
df = op.load.avro("foo.avro")
df = op.load.file("foo.anything")
df = op.load.csv("http://.../foo.csv")
df = op.load.json("http://./f.json")
df = op.load.file("http://./f.parquet")
Data Saving
df.save.csv("directory/foo.csv")
df.save.json("directory/foo.json")
df.save.parquet("directory/foo.parquet")
df.save.excel("directory/foo.xls")
df.save.avro("directory/foo.avro")
Method Chaining
df = df\
.rows.sort(["rank", "height"])\
.cols.lower(["names", "function"])\
.cols.normalize_chars("names")\
.cols.remove_special_chars("names")\
.cols.trim("names")
| names |
height |
function |
rank |
| optimus |
28.0 |
leader |
10 |
| ironhide |
26.0 |
security |
7 |
| bumblebee |
17.5 |
espionage |
7 |
Summarize Data
df.cols.sum()
df.cols.min()
df.cols.max()
df.cols.median()
df.cols.mean()
df.cols.std()
df.cols.quantile([0.25,0.75])
Machine Learning
model = df.ml.logistic_regression_text("sentence")
model = df.ml.random_forest("diagnosis")
model = df.ml.decision_tree(cols, "diagnosis")
model = df.ml.gbt(cols, "diagnosis")
Reshaping Data
df = df.melt(
"names",
["height", "function", "rank"])
| names |
height |
function |
rank |
| optimus |
28.0 |
leader |
10 |
| ironhide |
26.0 |
security |
7 |
| bumblebee |
17.5 |
espionage |
7 |
🡆
| names |
variable |
value |
| optimus |
height |
28.0 |
| optimus |
function |
leader |
| optimus |
rank |
10 |
| ironhide |
height |
26.0 |
| ironhide |
function |
security |
| ironhide |
rank |
7 |
| bumblebee |
height |
17.5 |
| bumblebee |
function |
espionage |
| bumblebee |
rank |
7 |
df = df.pivot("names", "variable", "value")
| names |
variable |
value |
| optimus |
height |
28.0 |
| optimus |
function |
leader |
| optimus |
rank |
10 |
| ironhide |
height |
26.0 |
| ironhide |
function |
security |
| ironhide |
rank |
7 |
| bumblebee |
height |
17.5 |
| bumblebee |
function |
espionage |
| bumblebee |
rank |
7 |
🡆
| names |
height |
function |
rank |
| optimus |
28.0 |
leader |
10 |
| ironhide |
26.0 |
security |
7 |
| bumblebee |
17.5 |
espionage |
7 |
df = df.rows.append([
"Grimlock",
"Commander",
"80",
"9"
])
| names |
height |
function |
rank |
| optimus |
28.0 |
leader |
10 |
| ironhide |
26.0 |
security |
7 |
| bumblebee |
17.5 |
espionage |
7 |
| names |
height |
function |
rank |
| Grimlock |
80 |
Commander |
9 |
}
| names |
height |
function |
rank |
| optimus |
28.0 |
leader |
10 |
| ironhide |
26.0 |
security |
7 |
| bumblebee |
17.5 |
espionage |
7 |
| Grimlock |
80 |
Commander |
9 |
df = df.rows.sort("names")
df = df.rows.sort("names", "asc")
df = df.cols.rename("names", "name")
df = df.cols.rename([
("name", "names"),
("function", "task")])
Handling Missing Data
df.rows.drop_na(cols)
df.cols.fill_na(cols, output_cols, value)
df.cols.impute(cols, strategy="mean")
Make new columns
df.cols.append("new_rank", df["rank"]+"1")
df.cols.qcut("height", "bins", 2)
String Processing
Key Collision
df.cols.fingerprint(df, "names")
df.string_clustering("names", "fingerprint")
df.cols.n_gram_fingerprint(df, "names", 2)
df.string_clustering("names", "n_gram_fingerprint", 2)
Select Rows
df.display(n)
df.rows.drop_duplicated()
df.rows.sample(n)
df.rows.select(df["rank"]>7)
df["A"] < df["B"]
|
Less than |
df["A"] > df["B"]
|
Greater than |
df["A"] == df["B"]
|
Equal to |
df["A"] <= df["B"]
|
Less than or equal to |
df["A"] >= df["B"]
|
Greater than or equal to |
df["A"] != df["B"]
|
Not equal to |
~df["A"]
|
Negation |
df["A"] & df["B"]
|
And |
df["A"] | df["B"]
|
Or |
Select Columns
df.cols.select(["names", "height", "function"])
df.cols.select([1, 3, 5])
df.cols.select("n.*", regex=True)
| '\.' |
Matches strings containing a period |
| 'Length$' |
Matches strings ending with the word 'Length' |
| '^Sepal' |
Matches strings beginning with the word 'Sepal' |
| '^x[1-5]$' |
Matches strings beginning with 'x' and ending with 1, 2, 3, 4, or 5 |
| '^(?!Species$).*' |
Matches strings except the string 'Species' |
Unnest
df = df\
.cols.unnest("col_to_unnest")
Nest
df = df\
.cols.nest(["names", "function"],
output_col = "nested_col",
separator=" ")
df = df\
.cols.nest(["names", "function"],
output_col = "new_col",
shape ="array")
Plotting
df.plot.hist("*")
df.plot.frequency("*")
df.plot.correlation("*")
Profiling
df.profile("*")
df.profile("names")
df.profile(["names", "height"])