import numpy as np
np.__version__
'1.18.1'
import array
L = list(range(10))
A = array.array('i', L) # i 是一个数据类型码,表示数据为整数
A
array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
# 整型数组
np.array([1, 4, 2, 5, 3])
array([1, 4, 2, 5, 3])
np.array([3.14, 4, 2, 3])
array([3.14, 4. , 2. , 3. ])
np.array([1, 2, 3 ,4], dtype='float32')
array([1., 2., 3., 4.], dtype=float32)
np.array([range(i, i + 3) for i in [2, 4, 6]])
array([[2, 3, 4],
[4, 5, 6],
[6, 7, 8]])
# 创建一个长度为10的数组,数组的值为0
np.zeros(10, dtype=int)
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
# 创建一个3×5的浮点型数组,数组值都是1
np.ones((3, 5), dtype=float)
array([[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]])
# 创建一个3×5的浮点型数组,数组值都是3.14
np.full((3, 5), 3.14)
array([[3.14, 3.14, 3.14, 3.14, 3.14],
[3.14, 3.14, 3.14, 3.14, 3.14],
[3.14, 3.14, 3.14, 3.14, 3.14]])
# 创建一个线性序列数组
# 从0开始,到20结束,步长为2
# 和内置range()差不多
np.arange(0, 20, 2)
array([ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18])
# 创建一个5个元素的数组,这5个数均匀的分配到0~1
np.linspace(0, 1, 5)
array([0. , 0.25, 0.5 , 0.75, 1. ])
# 创建一个3×3的、在0~1均匀分布的随机数组成的数组
# 只能获取0~1范围随机数,其他范围采用乘法即可
np.random.random((3, 3))
array([[0.4578806 , 0.92203829, 0.05448214],
[0.78920131, 0.35137756, 0.02365432],
[0.98200314, 0.96379388, 0.98053799]])
# 创建一个3×3的、均值为0、标准差为1的
# 正态分布的随机数数组
np.random.normal(0, 1, (3, 3))
array([[ 0.40610554, 0.81449554, -0.38442201],
[ 0.67406335, -0.42994394, -0.25169288],
[-1.35054767, 0.12160682, 0.47200046]])
# 创建一个3×3的,[0, 10)区间的随机整型数组
np.random.randint(0, 10, (3, 3))
array([[7, 0, 2],
[1, 2, 3],
[2, 5, 3]])
# 创建一个3×3的单位矩阵
np.eye(3)
array([[1., 0., 0.],
[0., 1., 0.],
[0., 0., 1.]])
# 创建一个由3个浮点数组成的未初始化的数组(np.empty 默认的数据类型是 float64)
# 数组值是内存空间中的任意值
np.empty(3)
array([1., 1., 1.])
np.zeros(10, dtype='int16')
或者
np.zeros(10, dtype=np.int16)
Data type | Description |
---|---|
bool_ |
Boolean (True or False) stored as a byte |
int_ |
Default integer type (same as C long ; normally either int64 or int32 ) |
intc |
Identical to C int (normally int32 or int64 ) |
intp |
Integer used for indexing (same as C ssize_t ; normally either int32 or int64 ) |
int8 |
Byte (-128 to 127) |
int16 |
Integer (-32768 to 32767) |
int32 |
Integer (-2147483648 to 2147483647) |
int64 |
Integer (-9223372036854775808 to 9223372036854775807) |
uint8 |
Unsigned integer (0 to 255) |
uint16 |
Unsigned integer (0 to 65535) |
uint32 |
Unsigned integer (0 to 4294967295) |
uint64 |
Unsigned integer (0 to 18446744073709551615) |
float_ |
Shorthand for float64 . |
float16 |
Half precision float: sign bit, 5 bits exponent, 10 bits mantissa |
float32 |
Single precision float: sign bit, 8 bits exponent, 23 bits mantissa |
float64 |
Double precision float: sign bit, 11 bits exponent, 52 bits mantissa |
complex_ |
Shorthand for complex128 . |
complex64 |
Complex number, represented by two 32-bit floats |
complex128 |
Complex number, represented by two 64-bit floats |
import numpy as np
np.random.seed(0) # 设置随机数种子
x1 = np.random.randint(10, size=6) # 一维数组
x2 = np.random.randint(10, size=(3, 4)) # 二维数组
x3 = np.random.randint(10, size=(3, 4, 5)) # 三维数组
每个数组都有 ndim(数组的维度)、shape(数组每个维度的大小)、size(数组的大小)、dtype(数组的数据类型)属性:
print('x3 ndim : ', x3.ndim)
print('x3 shape: ', x3.shape)
print('x3 size : ', x3.size)
print('x3 dtype: ', x3.dtype)
x3 ndim : 3
x3 shape: (3, 4, 5)
x3 size : 60
x3 dtype: int64
其他属性还包括代表每个数组元素字节大小的 itemsize,以及代表数组总字节大小的属性 nbytes:
一般来说可以认为 nbytes = itemsize × size
print('itemsize:', x3.itemsize, 'bytes')
print('nbytes :', x3.nbytes, 'bytes')
itemsize: 8 bytes
nbytes : 480 bytes
x1
array([5, 0, 3, 3, 7, 9])
x1[0]
5
x2
array([[3, 5, 2, 4],
[7, 6, 8, 8],
[1, 6, 7, 7]])
x2[0][0]
3
x2[0][0] = 12
x2
array([[12, 5, 2, 4],
[ 7, 6, 8, 8],
[ 1, 6, 7, 7]])
# 但是如果数据类型不同,会自动截取
x1[0] = 3.1415926
x1
array([3, 0, 3, 3, 7, 9])
x[start:stop:step]
x = np.arange(10)
x
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
x[:5]
array([0, 1, 2, 3, 4])
x[::2]
array([0, 2, 4, 6, 8])
x2
array([[12, 5, 2, 4],
[ 7, 6, 8, 8],
[ 1, 6, 7, 7]])
x2[:2, :3] # 两行,三列
array([[12, 5, 2],
[ 7, 6, 8]])
x2[:3, ::2] # 三行,每隔一列
array([[12, 2],
[ 7, 8],
[ 1, 7]])
x2[::-1, ::-1] # 全部逆序
array([[ 7, 7, 6, 1],
[ 8, 8, 6, 7],
[ 4, 2, 5, 12]])
print(x2[:, 0]) # 第一列
[12 7 1]
print(x2[0, :]) # 第一行
[12 5 2 4]
# 出于简便,获取行时可以省略列的切片
print(x2[0])
[12 5 2 4]
print(x2)
[[12 5 2 4]
[ 7 6 8 8]
[ 1 6 7 7]]
x2_sub = x2[:2, :2]
x2_sub
array([[12, 5],
[ 7, 6]])
x2_sub[0, 0] = 99
x2_sub
array([[99, 5],
[ 7, 6]])
print(x2)
[[99 5 2 4]
[ 7 6 8 8]
[ 1 6 7 7]]
x2_sub_copy = x2[:2, :2].copy()
print(x2_sub_copy)
[[99 5]
[ 7 6]]
x2_sub_copy[0, 0] = 42
print(x2_sub_copy)
[[42 5]
[ 7 6]]
print(x2)
[[99 5 2 4]
[ 7 6 8 8]
[ 1 6 7 7]]
x = np.array([1, 2, 3])
x.reshape((3, 1))
array([[1],
[2],
[3]])
print(x)
[1 2 3]
x[:, np.newaxis]
array([[1],
[2],
[3]])
print(x)
[1 2 3]
x = np.array([1, 2, 3])
y = np.array([3, 2, 1])
np.concatenate([x, y])
array([1, 2, 3, 3, 2, 1])
z = [99, 99, 99]
print(np.concatenate([x, y, z]))
[ 1 2 3 3 2 1 99 99 99]
grid = np.array([[1, 2, 3],
[4, 5, 6]])
# 沿着第一个轴进行拼接
np.concatenate([grid, grid])
array([[1, 2, 3],
[4, 5, 6],
[1, 2, 3],
[4, 5, 6]])
# 沿着第二个轴进行拼接
np.concatenate([grid, grid], axis=1)
array([[1, 2, 3, 1, 2, 3],
[4, 5, 6, 4, 5, 6]])
x = np.array([1, 2, 3])
grid = np.array([[9, 8, 7],
[6, 5, 4]])
# 垂直栈数组
np.vstack([x, grid])
array([[1, 2, 3],
[9, 8, 7],
[6, 5, 4]])
# 水平栈数组
y = np.array([[99],
[99]])
np.hstack([grid, y])
array([[ 9, 8, 7, 99],
[ 6, 5, 4, 99]])
x = [1, 2, 3, 99, 99, 3, 2, 1]
x1, x2, x3 = np.split(x, [3, 5]) # 从 3、5 位拆分成 3个列表
print(x1, x2, x3)
[1 2 3] [99 99] [3 2 1]
grid = np.arange(16).reshape((4, 4))
grid
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11],
[12, 13, 14, 15]])
upper, lower = np.vsplit(grid, [2])
print(upper)
print(lower)
[[0 1 2 3]
[4 5 6 7]]
[[ 8 9 10 11]
[12 13 14 15]]
left, right = np.hsplit(grid, [2])
print(left)
print(right)
[[ 0 1]
[ 4 5]
[ 8 9]
[12 13]]
[[ 2 3]
[ 6 7]
[10 11]
[14 15]]
import numpy as np
np.random.seed(0)
def compute_reciprocals(val):
    """Return the element-wise reciprocals of ``val`` as a float array.

    The element-by-element Python loop is kept on purpose: the surrounding
    notes time this function against the vectorized ``1.0 / val``.

    Parameters
    ----------
    val : sequence of numbers
        Sized, iterable input such as a 1-D NumPy array or a list.

    Returns
    -------
    numpy.ndarray
        Array of ``len(val)`` floats with ``output[i] == 1.0 / val[i]``.
    """
    # np.empty leaves the buffer uninitialized; every slot is overwritten
    # by the loop below, so no stale values survive.
    output = np.empty(len(val))
    for i, value in enumerate(val):
        output[i] = 1.0 / value
    return output
val = np.random.randint(1, 10, size=5)
compute_reciprocals(val)
array([0.16666667, 1. , 0.25 , 0.25 , 0.125 ])
# 在手机都以每秒十亿次浮点运算为单位衡量处理速度的今天,这个计算如此耗时很明显是不正常的
big_array = np.random.randint(1,100, size=1000000)
%timeit compute_reciprocals(big_array)
1.65 s ± 8.67 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
print(compute_reciprocals(val))
print(1.0 / val)
[0.16666667 1. 0.25 0.25 0.125 ]
[0.16666667 1. 0.25 0.25 0.125 ]
# 可以看到,向量化运算的结果与循环版本相同,而耗时比 Python 循环短得多
%timeit (1.0 / big_array)
1.11 ms ± 9.69 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
np.arange(5) / np.arange(1, 6)
array([0. , 0.5 , 0.66666667, 0.75 , 0.8 ])
x = np.arange(9).reshape((3, 3))
2 ** x
array([[ 1, 2, 4],
[ 8, 16, 32],
[ 64, 128, 256]])
x = np.arange(4)
print("x =", x)
print("x + 5 =", x + 5)
print("x - 5 =", x - 5)
print("x * 2 =", x * 2)
print("x / 2 =", x / 2)
print("x // 2 =", x // 2)
print("-x =", -x)
print("x ** 2 =", x ** 2)
print("x % 2 =", x % 2)
x = [0 1 2 3]
x + 5 = [5 6 7 8]
x - 5 = [-5 -4 -3 -2]
x * 2 = [0 2 4 6]
x / 2 = [0. 0.5 1. 1.5]
x // 2 = [0 0 1 1]
-x = [ 0 -1 -2 -3]
x ** 2 = [0 1 4 9]
x % 2 = [0 1 0 1]
-(0.5*x + 1) ** 2
array([-1. , -2.25, -4. , -6.25])
np.add(x, 2)
array([2, 3, 4, 5])
NumPy实现的算术运算符
运算符 | 对应的通用函数 | 描述 |
---|---|---|
+ |
np.add |
加法运算 (e.g., 1 + 1 = 2 ) |
- |
np.subtract |
减法运算 (e.g., 3 - 2 = 1 ) |
- |
np.negative |
负数运算 (e.g., -2 ) |
* |
np.multiply |
乘法运算 (e.g., 2 * 3 = 6 ) |
/ |
np.divide |
除法运算 (e.g., 3 / 2 = 1.5 ) |
// |
np.floor_divide |
向下整除运算 (e.g., 3 // 2 = 1 ) |
** |
np.power |
指数运算 (e.g., 2 ** 3 = 8 ) |
% |
np.mod |
模/余数 (e.g., 9 % 4 = 1 ) |
x = np.arange(-2, 3)
x
array([-2, -1, 0, 1, 2])
abs(x)
array([2, 1, 0, 1, 2])
np.absolute(x)
array([2, 1, 0, 1, 2])
np.abs(x)
array([2, 1, 0, 1, 2])
# 对于复数,绝对值返回的是该复数的模
x = np.array([3-4j, 4-3j, 2+0j, 0+1j])
np.abs(x)
array([5., 5., 2., 1.])
theta = np.linspace(0, np.pi, 3)
print("theta = ", theta)
print("sin(theta) = ", np.sin(theta))
print("cos(theta) = ", np.cos(theta))
print("tan(theta) = ", np.tan(theta))
theta = [0. 1.57079633 3.14159265]
sin(theta) = [0.0000000e+00 1.0000000e+00 1.2246468e-16]
cos(theta) = [ 1.000000e+00 6.123234e-17 -1.000000e+00]
tan(theta) = [ 0.00000000e+00 1.63312394e+16 -1.22464680e-16]
# 逆三角函数
x = [-1, 0, 1]
print("x = ", x)
print("arcsin(x) = ", np.arcsin(x))
print("arccos(x) = ", np.arccos(x))
print("arctan(x) = ", np.arctan(x))
x = [-1, 0, 1]
arcsin(x) = [-1.57079633 0. 1.57079633]
arccos(x) = [3.14159265 1.57079633 0. ]
arctan(x) = [-0.78539816 0. 0.78539816]
# 指数运算
x = [1, 2, 3]
print("x =", x)
print("e^x =", np.exp(x))
print("2^x =", np.exp2(x))
print("3^x =", np.power(3, x))
x = [1, 2, 3]
e^x = [ 2.71828183 7.3890561 20.08553692]
2^x = [2. 4. 8.]
3^x = [ 3 9 27]
# 对数运算
x = [1, 2, 4, 10]
print("x =", x)
print("ln(x) =", np.log(x))
print("log2(x) =", np.log2(x))
print("log10(x) =", np.log10(x))
x = [1, 2, 4, 10]
ln(x) = [0. 0.69314718 1.38629436 2.30258509]
log2(x) = [0. 1. 2. 3.32192809]
log10(x) = [0. 0.30103 0.60205999 1. ]
# 特殊运算
x = [0, 0.001, 0.01, 0.1]
print("exp(x) - 1 =", np.expm1(x))
print("log(1 + x) =", np.log1p(x))
exp(x) - 1 = [0. 0.0010005 0.01005017 0.10517092]
log(1 + x) = [0. 0.0009995 0.00995033 0.09531018]
双曲三角函数、比特位运算、比较运算符、弧度转化为角度的运算、取整、取余等等;更专用的函数可参见 scipy.special。
from scipy import special
# Gamma函数(广义阶乘)和相关函数
x = [1, 5, 10]
print("gamma(x) =", special.gamma(x))
print("ln|gamma(x)| =", special.gammaln(x))
print("beta(x, 2) =", special.beta(x, 2))
gamma(x) = [1.0000e+00 2.4000e+01 3.6288e+05]
ln|gamma(x)| = [ 0. 3.17805383 12.80182748]
beta(x, 2) = [0.5 0.03333333 0.00909091]
# 误差函数(高斯积分)
# 他的实现和它的逆实现
x = np.array([0, 0.3, 0.7, 1.0])
print("erf(x) =", special.erf(x))
print("erfc(x) =", special.erfc(x))
print("erfinv(x) =", special.erfinv(x))
erf(x) = [0. 0.32862676 0.67780119 0.84270079]
erfc(x) = [1. 0.67137324 0.32219881 0.15729921]
erfinv(x) = [0. 0.27246271 0.73286908 inf]
x = np.arange(5) # the integers 0 through 4
y = np.empty(5) # uninitialized float array that will receive the result
np.multiply(x, 10, out=y) # multiply and write the result directly into y
print(y)
[ 0. 10. 20. 30. 40.]
y = np.zeros(10)
np.power(2, x, out=y[::2])
print(y)
[ 1. 0. 2. 0. 4. 0. 8. 0. 16. 0.]
注意:在一些特殊的情况中,NumPy提供了专用的函数(np.sum、np.prod、np.cumsum、np.cumprod),他们也可以实现上面 reduce 的功能
例如:对 add 通用函数调用 reduce 方法会返回数组中的所有元素和:
x = np.arange(1, 6)
np.add.reduce(x)
15
np.multiply.reduce(x)
120
# 储存每次计算的结果
np.add.accumulate(x)
array([ 1, 3, 6, 10, 15])
np.multiply.accumulate(x)
array([ 1, 2, 6, 24, 120])
x = np.arange(1, 6)
np.multiply.outer(x, x)
array([[ 1, 2, 3, 4, 5],
[ 2, 4, 6, 8, 10],
[ 3, 6, 9, 12, 15],
[ 4, 8, 12, 16, 20],
[ 5, 10, 15, 20, 25]])
import numpy as np
L = np.random.random(100)
# Python 自带
sum(L)
53.390207567935924
# NumPy 提供的,更快一些
np.sum(L)
53.3902075679359
big_array = np.random.random(1000000)
%timeit sum(big_array)
%timeit np.sum(big_array)
119 ms ± 2.32 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
311 µs ± 7.16 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
min(big_array), max(big_array)
(2.657882510970211e-06, 0.9999997356284774)
np.min(big_array), np.max(big_array)
(2.657882510970211e-06, 0.9999997356284774)
%timeit min(big_array)
%timeit np.min(big_array)
73 ms ± 510 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
329 µs ± 5.04 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
print(big_array.min(), big_array.max(), big_array.sum())
2.657882510970211e-06 0.9999997356284774 500652.34777085617
M = np.random.random((3, 4))
print(M)
[[0.86034037 0.93821946 0.87046115 0.97764813]
[0.99807947 0.06686359 0.07370805 0.54976723]
[0.11073139 0.44798236 0.79951814 0.54598933]]
默认情况下,每个NumPy聚合函数将会返回对整个数组的聚合结果:
M.sum()
7.239308667849184
聚合函数还有一个参数,用于指向沿着哪个 轴 的方向进行聚合
M.min(axis=0) # 每一列最小值
array([0.11073139, 0.06686359, 0.07370805, 0.54598933])
M.max(axis=1) # 每一个行最大值
array([0.97764813, 0.99807947, 0.79951814])
函数名称 | NaN安全版本 | 描述 |
---|---|---|
np.sum |
np.nansum |
计算元素的和 |
np.prod |
np.nanprod |
计算元素的积 |
np.mean |
np.nanmean |
计算元素的平均值 |
np.std |
np.nanstd |
计算元素的标准差 |
np.var |
np.nanvar |
计算元素的方差 |
np.min |
np.nanmin |
找出最小值 |
np.max |
np.nanmax |
找出最大值 |
np.argmin |
np.nanargmin |
找出最小值索引 |
np.argmax |
np.nanargmax |
找出最大值索引 |
np.median |
np.nanmedian |
计算元素的中位数 |
np.percentile |
np.nanpercentile |
计算基于元素排序的统计值 |
np.any |
N/A | 验证是否存在元素为真 |
np.all |
N/A | 验证所有元素是否为真 |
!head -4 PythonDataScienceHandbook-master/notebooks/data/president_heights.csv
order,name,height(cm)
1,George Washington,189
2,John Adams,170
3,Thomas Jefferson,189
import pandas as pd
data = pd.read_csv('PythonDataScienceHandbook-master/notebooks/data/president_heights.csv')
heights = np.array(data['height(cm)'])
print(heights)
[189 170 189 163 183 171 185 168 173 183 173 173 175 178 183 193 178 173
174 183 183 168 170 178 182 180 183 178 182 188 175 179 183 193 182 183
177 185 188 188 182 185]
print("平均身高: ", heights.mean())
print("标准偏差: ", heights.std())
print("最矮身高: ", heights.min())
print("最高身高: ", heights.max())
平均身高: 179.73809523809524
标准偏差: 6.931843442745892
最矮身高: 163
最高身高: 193
# Quartiles of the heights; each label matches the percentile actually computed.
print("25% : ", np.percentile(heights, 25))
print("中位数: ", np.median(heights))
print("75% : ", np.percentile(heights, 75))
25% : 174.25
中位数: 182.0
75% : 183.0
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn;seaborn.set() # 设置绘图风格
plt.hist(heights)
plt.title('Height Distribution of US Presidents')
plt.xlabel('height (cm)')
plt.ylabel('number');
import numpy as np
a = np.array([0, 1, 2])
b = np.array([5, 5, 5])
a + b
array([5, 6, 7])
a + 5
array([5, 6, 7])
M = np.ones((3, 3))
M
array([[1., 1., 1.],
[1., 1., 1.],
[1., 1., 1.]])
M + a
array([[1., 2., 3.],
[1., 2., 3.],
[1., 2., 3.]])
a = np.arange(3)
b = np.arange(3)[:, np.newaxis]
print(a)
print(b)
[0 1 2]
[[0]
[1]
[2]]
a + b
array([[0, 1, 2],
[1, 2, 3],
[2, 3, 4]])
M = np.ones((2, 3))
a = np.arange(3)
print(M)
print(a)
[[1. 1. 1.]
[1. 1. 1.]]
[0 1 2]
M.shape = (2, 3)
a.shape = (3, )
M.shape → (2, 3)
a.shape → (1, 3)
M.shape → (2, 3)
a.shape → (2, 3)
M + a
array([[1., 2., 3.],
[1., 2., 3.]])
a = np.arange(3).reshape((3, 1))
b = np.arange(3)
print(a)
print(b)
[[0]
[1]
[2]]
[0 1 2]
a.shape = (3, 1)
b.shape = (3, )
a.shape → (3, 1)
b.shape → (1, 3)
a.shape → (3, 3)
b.shape → (3, 3)
a + b
array([[0, 1, 2],
[1, 2, 3],
[2, 3, 4]])
M = np.ones((3, 2))
a = np.arange(3)
print(M)
print(a)
[[1. 1.]
[1. 1.]
[1. 1.]]
[0 1 2]
M.shape = (3, 2)
a.shape = (3, )
M.shape → (3, 2)
a.shape → (1, 3)
M.shape → (3, 2)
a.shape → (3, 3)
M + a
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
in
----> 1 M + a
ValueError: operands could not be broadcast together with shapes (3,2) (3,)
X = np.random.random((10, 3))
X
array([[0.57225616, 0.49505128, 0.10433613],
[0.93963827, 0.47398203, 0.88429714],
[0.9831127 , 0.19311366, 0.43482586],
[0.08261148, 0.9079322 , 0.32549832],
[0.23780892, 0.79475765, 0.06536073],
[0.73393657, 0.43258879, 0.92306817],
[0.99624224, 0.30137717, 0.29146581],
[0.59269839, 0.20688911, 0.89979042],
[0.33540328, 0.37427399, 0.11429458],
[0.9528401 , 0.77827771, 0.30405051]])
# 计算均值
Xmean = X.mean(0)
Xmean
array([0.64265481, 0.49582436, 0.43469877])
X_centered = X - Xmean
X_centered
array([[-7.03986458e-02, -7.73076941e-04, -3.30362636e-01],
[ 2.96983464e-01, -2.18423290e-02, 4.49598370e-01],
[ 3.40457888e-01, -3.02710701e-01, 1.27092646e-04],
[-5.60043330e-01, 4.12107836e-01, -1.09200445e-01],
[-4.04845893e-01, 2.98933289e-01, -3.69338039e-01],
[ 9.12817608e-02, -6.32355688e-02, 4.88369399e-01],
[ 3.53587425e-01, -1.94447184e-01, -1.43232955e-01],
[-4.99564218e-02, -2.88935253e-01, 4.65091652e-01],
[-3.07251535e-01, -1.21550367e-01, -3.20404184e-01],
[ 3.10185288e-01, 2.82453355e-01, -1.30648256e-01]])
# 均值应该接近 0,考虑到精度,该均值为 0
X_centered.mean(0)
array([-2.22044605e-17, -4.44089210e-17, 0.00000000e+00])
# x、y表示 0~5 区间 50 个步长的序列
x = np.linspace(0, 5, 50)
y = np.linspace(0, 5, 50)[:, np.newaxis]
z = np.sin(x) ** 10 + np.cos(10 + y * x) * np.cos(x)
%matplotlib inline
import matplotlib.pyplot as plt
plt.imshow(z, origin='lower', extent=[0, 5, 0, 5], cmap='viridis')
plt.colorbar()
import numpy as np
import pandas as pd
# 利用 pandas 读取数据,放进一个 numpy 数组
rainfall = pd.read_csv('PythonDataScienceHandbook-master/notebooks/data/Seattle2014.csv')['PRCP'].values
rainfall
array([ 0, 41, 15, 0, 0, 3, 122, 97, 58, 43, 213, 15, 0,
0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0,
0, 89, 216, 0, 23, 20, 0, 0, 0, 0, 0, 0, 51,
5, 183, 170, 46, 18, 94, 117, 264, 145, 152, 10, 30, 28,
25, 61, 130, 3, 0, 0, 0, 5, 191, 107, 165, 467, 30,
0, 323, 43, 188, 0, 0, 5, 69, 81, 277, 3, 0, 5,
0, 0, 0, 0, 0, 41, 36, 3, 221, 140, 0, 0, 0,
0, 25, 0, 46, 0, 0, 46, 0, 0, 0, 0, 0, 0,
5, 109, 185, 0, 137, 0, 51, 142, 89, 124, 0, 33, 69,
0, 0, 0, 0, 0, 333, 160, 51, 0, 0, 137, 20, 5,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38,
0, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 18, 64, 0, 5, 36, 13, 0,
8, 3, 0, 0, 0, 0, 0, 0, 18, 23, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 3, 193, 0, 0, 0, 0,
0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0,
0, 5, 127, 216, 0, 10, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 84, 13, 0, 30, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5,
3, 0, 0, 0, 3, 183, 203, 43, 89, 0, 0, 8, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 74, 0, 76,
71, 86, 0, 33, 150, 0, 117, 10, 320, 94, 41, 61, 15,
8, 127, 5, 254, 170, 0, 18, 109, 41, 48, 41, 0, 0,
51, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 152,
5, 119, 13, 183, 3, 33, 343, 36, 0, 0, 0, 0, 8,
30, 74, 0, 91, 99, 130, 69, 0, 0, 0, 0, 0, 28,
130, 30, 196, 0, 0, 206, 53, 0, 0, 33, 41, 0, 0,
0])
inches = rainfall / 254 # 1/10mm 单位换成 英寸
inches.shape
(365,)
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn; seaborn.set() # 设置绘图风格
plt.hist(inches, 40) # 数据均分为40个区间
# 下面显示第一个代表 40 个均分区间的计数,即 y轴 计数
# 第二个代表 40 个区间的 x轴 坐标
(array([245., 14., 13., 17., 8., 6., 5., 6., 4., 3., 7.,
6., 3., 3., 3., 4., 4., 2., 4., 0., 0., 1.,
1., 1., 0., 0., 0., 2., 1., 1., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 1.]),
array([0. , 0.04596457, 0.09192913, 0.1378937 , 0.18385827,
0.22982283, 0.2757874 , 0.32175197, 0.36771654, 0.4136811 ,
0.45964567, 0.50561024, 0.5515748 , 0.59753937, 0.64350394,
0.6894685 , 0.73543307, 0.78139764, 0.8273622 , 0.87332677,
0.91929134, 0.96525591, 1.01122047, 1.05718504, 1.10314961,
1.14911417, 1.19507874, 1.24104331, 1.28700787, 1.33297244,
1.37893701, 1.42490157, 1.47086614, 1.51683071, 1.56279528,
1.60875984, 1.65472441, 1.70068898, 1.74665354, 1.79261811,
1.83858268]),
)
x = np.array([1, 2, 3, 4, 5])
x < 3
array([ True, True, False, False, False])
x > 3
array([False, False, False, True, True])
x <= 3
array([ True, True, True, False, False])
x >= 3
array([False, False, True, True, True])
x != 3
array([ True, True, False, True, True])
x == 3
array([False, False, True, False, False])
(2 * x) == (x ** 2)
array([False, True, False, False, False])
运算符 | 对应的通用函数 |
---|---|
== |
np.equal |
!= |
np.not_equal |
< |
np.less |
<= |
np.less_equal |
> |
np.greater |
>= |
np.greater_equal |
rng = np.random.RandomState(0)
x = rng.randint(10, size=(3, 4))
x
array([[5, 0, 3, 3],
[7, 9, 3, 5],
[2, 4, 7, 6]])
x < 6
array([[ True, True, True, True],
[False, False, True, True],
[ True, True, False, False]])
print(x)
[[5 0 3 3]
[7 9 3 5]
[2 4 7 6]]
np.count_nonzero(x < 6)
8
我们看到8个数是小于6的。另外一种实现方式是利用 np.sum
np.sum(x < 6)
8
np.sum(x < 6, axis=1)
array([4, 2, 2])
如要快速检查任意或者所有这些值是否为 True,可以用 np.any() 或者 np.all()
# 有没有值大于8
np.any(x > 8)
True
np.any(x < 0)
False
# 是否所有值都小于10
np.all(x < 10)
True
np.all(x == 6)
False
np.all(x < 8, axis=1)
array([ True, False, True])
# 降雨量在 0.5~1.0 英寸之间的天数
np.sum((inches > 0.5) & (inches < 1))
29
如果上面不加括号,就变成下面
inches > (0.5 & inches) < 1
利用 A AND B 和 NOT(NOT A OR NOT B) 的等价原理
np.sum(~( (inches <= 0.5) | (inches >= 1 )))
29
运算符 | 对应通用函数 |
---|---|
& |
np.bitwise_and |
| | np.bitwise_or |
^ |
np.bitwise_xor |
~ |
np.bitwise_not |
print("Number days without rain :", np.sum(inches == 0))
print("Number days with rain :", np.sum(inches != 0))
print("Days with more than 0.5 inches:", np.sum(inches > 0.5))
print("Rainy days with < 0.2 inches :", np.sum((inches > 0) & (inches < 0.2)))
Number days without rain : 215
Number days with rain : 150
Days with more than 0.5 inches: 37
Rainy days with < 0.2 inches : 75
print(x)
[[5 0 3 3]
[7 9 3 5]
[2 4 7 6]]
x < 5
array([[False, True, True, True],
[False, False, True, False],
[ True, True, False, False]])
现在为了将这些值从数组中 选出 ,可以进行简单地索引,即 掩码 操作:
x[x < 5]
array([0, 3, 3, 3, 2, 4])
# 为所有下雨天创建一个掩码
rainy = (inches > 0)
# 构建一个包含整个夏季日期的掩码(6月21日是第172天)
days = np.arange(365)
summer = (days > 172) & (days < 262)
print("Median precip on rainy days in 2014 (inches): ", np.median(inches[rainy]))
print("Median precip on summer days in 2014 (inches): ", np.median(inches[summer]))
print("Maximum precip on summer days in 2014 (inches): ", np.max(inches[summer]))
print("Median precip on non-summer rainy days (inches):", np.median(inches[rainy & ~summer]))
Median precip on rainy days in 2014 (inches): 0.19488188976377951
Median precip on summer days in 2014 (inches): 0.0
Maximum precip on summer days in 2014 (inches): 0.8503937007874016
Median precip on non-summer rainy days (inches): 0.20078740157480315
import numpy as np
rand = np.random.RandomState(42)
x = rand.randint(100, size=10)
print(x)
[51 92 14 71 60 20 82 86 74 74]
[x[3], x[7], x[2]]
[71, 86, 14]
ind = [3, 7, 2]
x[ind]
array([71, 86, 14])
利用花哨的索引,结果的形状与 索引数组 的形状一致,而不是与 被索引数组 的形状一致:
ind = np.array([[3, 7],
[4, 5]])
x[ind]
array([[71, 86],
[60, 20]])
X = np.arange(12).reshape((3, 4))
X
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
row = np.array([0, 1, 2])
col = np.array([2, 1, 3])
X[row, col]
array([ 2, 5, 11])
X[row[:, np.newaxis], col]
array([[ 2, 1, 3],
[ 6, 5, 7],
[10, 9, 11]])
row[:, np.newaxis] * col
array([[0, 0, 0],
[2, 1, 3],
[4, 2, 6]])
0:2, 0:1, 0:3 -> 0, 0, 0
1:2, 1:1, 1:3 -> 2, 1, 3
2:2, 2:1, 2:3 -> 4, 2, 6
print(X)
[[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]]
X[2, [2, 0, 1]]
array([10, 8, 9])
X[1:, [2, 0, 1]]
array([[ 6, 4, 5],
[10, 8, 9]])
mask = np.array([1, 0, 1, 0], dtype=bool)
X[row[:, np.newaxis], mask]
array([[ 0, 2],
[ 4, 6],
[ 8, 10]])
花哨的索引的一个常见的用途是从一个矩阵中选择行的子集。
mean = [0, 0]
cov = [[1, 2],
[2, 5]]
X = rand.multivariate_normal(mean, cov, 100)
X.shape
(100, 2)
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn; seaborn.set()
plt.scatter(X[:, 0], X[:, 1])
indices = np.random.choice(X.shape[0], 20, replace=False)
indices
array([66, 4, 48, 21, 7, 90, 68, 26, 85, 8, 81, 55, 45, 3, 9, 54, 20,
6, 69, 19])
selection = X[indices]
selection.shape
(20, 2)
plt.scatter(X[:, 0], X[:, 1], alpha=0.3)
plt.scatter(selection[:, 0], selection[:, 1], facecolor='none', edgecolors='b', s=200)
x = np.arange(10)
x
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
i = np.array([2, 1, 8, 4])
x[i] = 99
x
array([ 0, 99, 99, 3, 99, 5, 6, 7, 99, 9])
x = np.zeros(10)
x[[0, 0]] = [4, 6]
x
array([6., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
at(数组, 索引, 加几) 方法(NumPy 1.8 以后的版本可以使用)可以对重复出现的索引进行累加;类似的方法还有 reduceat()。
i = [2, 3, 3, 4, 4, 4]
x[i] += 1
x
array([6., 0., 1., 1., 1., 0., 0., 0., 0., 0.])
x = np.zeros(10)
np.add.at(x, i, 1)
x
array([0., 0., 1., 2., 3., 0., 0., 0., 0., 0.])
x = np.zeros(10)
np.add.at(x, i, 2)
x
array([0., 0., 2., 4., 6., 0., 0., 0., 0., 0.])
x = np.zeros(10)
np.add.at(x, i, [1, 1, 1, 1, 1, 1])
x
array([0., 0., 1., 2., 3., 0., 0., 0., 0., 0.])
ufunc.at 来计算。
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)  # fix the seed so the sample is reproducible
x = np.random.randn(100)  # 100 draws from the standard normal distribution

# Compute the histogram by hand.
bins = np.linspace(-5, 5, 20)  # 20 evenly spaced points from -5 to 5
counts = np.zeros_like(bins)  # zero array with the same shape as bins

# For every sample, find the index at which it would be inserted into bins.
# NOTE: searchsorted returns len(bins) for values above bins[-1], which would
# index past the end of counts; the 100 samples drawn here all fall inside
# the range covered by bins.
i = np.searchsorted(bins, x)

# Add 1 at the index of each sample; np.add.at accumulates repeated indices.
np.add.at(counts, i, 1)

# Plot the result. The step style is passed as drawstyle because passing it
# through linestyle is deprecated since Matplotlib 3.1 and removed in 3.3.
plt.plot(bins, counts, drawstyle='steps')
~/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:19: MatplotlibDeprecationWarning: Passing the drawstyle with the linestyle as a single string is deprecated since Matplotlib 3.1 and support will be removed in 3.3; please pass the drawstyle separately using the drawstyle keyword argument to Line2D or set_drawstyle() method (or ds/set_ds()).
[]
plt.hist()
plt.hist(x, bins, histtype='step')
(array([ 0., 0., 0., 0., 1., 3., 7., 9., 23., 22., 17., 10., 7.,
1., 0., 0., 0., 0., 0.]),
array([-5. , -4.47368421, -3.94736842, -3.42105263, -2.89473684,
-2.36842105, -1.84210526, -1.31578947, -0.78947368, -0.26315789,
0.26315789, 0.78947368, 1.31578947, 1.84210526, 2.36842105,
2.89473684, 3.42105263, 3.94736842, 4.47368421, 5. ]),
)
print("NumPy routine:")
%timeit counts, edges = np.histogram(x, bins)
print("Custom routine:")
%timeit np.add.at(counts, np.searchsorted(bins, x), 1)
NumPy routine:
21.5 µs ± 241 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Custom routine:
11.9 µs ± 69.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
x = np.random.randn(10000000)
print("NumPy routine:")
%timeit counts, edges = np.histogram(x, bins)
print("Custom routine:")
%timeit np.add.at(counts, np.searchsorted(bins, x), 1)
NumPy routine:
564 ms ± 5.46 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Custom routine:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
in
4
5 print("Custom routine:")
----> 6 get_ipython().run_line_magic('timeit', 'np.add.at(counts, np.searchsorted(bins, x), 1)')
~/opt/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py in run_line_magic(self, magic_name, line, _stack_depth)
2315 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
2316 with self.builtin_trap:
-> 2317 result = fn(*args, **kwargs)
2318 return result
2319
in timeit(self, line, cell, local_ns)
~/opt/anaconda3/lib/python3.7/site-packages/IPython/core/magic.py in (f, *a, **k)
185 # but it's overkill for just that one bit of state.
186 def magic_deco(arg):
--> 187 call = lambda f, *a, **k: f(*a, **k)
188
189 if callable(arg):
~/opt/anaconda3/lib/python3.7/site-packages/IPython/core/magics/execution.py in timeit(self, line, cell, local_ns)
1158 for index in range(0, 10):
1159 number = 10 ** index
-> 1160 time_number = timer.timeit(number)
1161 if time_number >= 0.2:
1162 break
~/opt/anaconda3/lib/python3.7/site-packages/IPython/core/magics/execution.py in timeit(self, number)
167 gc.disable()
168 try:
--> 169 timing = self.inner(it, self.timer)
170 finally:
171 if gcold:
in inner(_it, _timer)
IndexError: index 20 is out of bounds for axis 0 with size 20
选择排序
重复寻找列表中最小的值,并不断交换直到列表是有序的。
import numpy as np
def selection_sort(x):
    """Sort ``x`` in place with selection sort and return it.

    For each position ``i`` the smallest element of ``x[i:]`` is located
    and swapped into position ``i``.

    Parameters
    ----------
    x : mutable sequence
        Sequence to sort, e.g. a 1-D NumPy array or a list. It is modified
        in place.

    Returns
    -------
    The same object ``x``, now in ascending order.
    """
    for i in range(len(x)):
        # argmin is relative to the slice x[i:], so shift it back by i.
        swap = i + np.argmin(x[i:])
        x[i], x[swap] = x[swap], x[i]
    return x
x = np.array([2, 1, 4, 3, 5])
selection_sort(x)
array([1, 2, 3, 4, 5])
# 不改变原数组
x = np.array([2, 1, 4, 3, 5])
print(np.sort(x))
print(x)
[1 2 3 4 5]
[2 1 4 3 5]
# 原数组排序
x.sort()
print(x)
[1 2 3 4 5]
x = np.array([2, 1, 4, 3, 5])
i = np.argsort(x)
print(x)
print(i)
[2 1 4 3 5]
[1 0 3 2 4]
x[i]
array([1, 2, 3, 4, 5])
rand = np.random.RandomState(42)
X = rand.randint(0, 10, (4, 6))
print(X)
[[6 3 7 4 6 9]
[2 6 7 4 3 7]
[7 2 5 4 1 7]
[5 1 4 0 9 5]]
# 对 X 的每一列排序
np.sort(X, axis=0)
array([[2, 1, 4, 0, 1, 5],
[5, 2, 5, 4, 3, 7],
[6, 3, 7, 4, 6, 7],
[7, 6, 7, 4, 9, 9]])
# 对 X 的每一行排序
np.sort(X, axis=1)
array([[3, 4, 6, 6, 7, 9],
[2, 3, 4, 6, 7, 7],
[1, 2, 4, 5, 7, 7],
[0, 1, 4, 5, 5, 9]])
np.partition 函数提供了该方法:输入数组和数字 K,输出结果是一个新数组,最左边是 K 个最小的值,往右是任意顺序的其他值。
x = np.array([7, 2, 3, 1, 6, 5, 4])
np.partition(x, 3)
array([2, 1, 3, 4, 6, 5, 7])
# 多维数组
np.partition(X, 2, axis=1)
array([[3, 4, 6, 7, 6, 9],
[2, 3, 4, 7, 6, 7],
[1, 2, 4, 5, 7, 7],
[0, 1, 4, 5, 9, 5]])
利用 argsort 函数沿着多个轴快速找到集合中每个点的最近邻。
X = rand.rand(10, 2)
X
array([[0.00706631, 0.02306243],
[0.52477466, 0.39986097],
[0.04666566, 0.97375552],
[0.23277134, 0.09060643],
[0.61838601, 0.38246199],
[0.98323089, 0.46676289],
[0.85994041, 0.68030754],
[0.45049925, 0.01326496],
[0.94220176, 0.56328822],
[0.3854165 , 0.01596625]])
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn; seaborn.set()
plt.scatter(X[:, 0], X[:, 1], s=100)
dist_sq = np.sum((X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2, axis=-1)
dist_sq
array([[0. , 0.40999909, 0.90538547, 0.05550496, 0.50287983,
1.14976739, 1.15936537, 0.19672877, 1.16632222, 0.14319923],
[0.40999909, 0. , 0.55794316, 0.18090431, 0.00906581,
0.21465798, 0.19098635, 0.15497331, 0.20095384, 0.16679585],
[0.90538547, 0.55794316, 0. , 0.81458763, 0.67649219,
1.13419594, 0.74752753, 1.08562368, 0.9704683 , 1.03211241],
[0.05550496, 0.18090431, 0.81458763, 0. , 0.23387834,
0.70468321, 0.74108843, 0.05338715, 0.72671958, 0.0288717 ],
[0.50287983, 0.00906581, 0.67649219, 0.23387834, 0. ,
0.14021843, 0.1470605 , 0.16449241, 0.13755476, 0.18859392],
[1.14976739, 0.21465798, 1.13419594, 0.70468321, 0.14021843,
0. , 0.06080186, 0.48946337, 0.01100053, 0.56059965],
[1.15936537, 0.19098635, 0.74752753, 0.74108843, 0.1470605 ,
0.06080186, 0. , 0.61258786, 0.02046045, 0.66652228],
[0.19672877, 0.15497331, 1.08562368, 0.05338715, 0.16449241,
0.48946337, 0.61258786, 0. , 0.54429694, 0.00424306],
[1.16632222, 0.20095384, 0.9704683 , 0.72671958, 0.13755476,
0.01100053, 0.02046045, 0.54429694, 0. , 0.60957115],
[0.14319923, 0.16679585, 1.03211241, 0.0288717 , 0.18859392,
0.56059965, 0.66652228, 0.00424306, 0.60957115, 0. ]])
dist_sq_2 = np.sum((X[:, np.newaxis] - X[np.newaxis, :]) ** 2, axis=-1)
dist_sq_2
array([[0. , 0.40999909, 0.90538547, 0.05550496, 0.50287983,
1.14976739, 1.15936537, 0.19672877, 1.16632222, 0.14319923],
[0.40999909, 0. , 0.55794316, 0.18090431, 0.00906581,
0.21465798, 0.19098635, 0.15497331, 0.20095384, 0.16679585],
[0.90538547, 0.55794316, 0. , 0.81458763, 0.67649219,
1.13419594, 0.74752753, 1.08562368, 0.9704683 , 1.03211241],
[0.05550496, 0.18090431, 0.81458763, 0. , 0.23387834,
0.70468321, 0.74108843, 0.05338715, 0.72671958, 0.0288717 ],
[0.50287983, 0.00906581, 0.67649219, 0.23387834, 0. ,
0.14021843, 0.1470605 , 0.16449241, 0.13755476, 0.18859392],
[1.14976739, 0.21465798, 1.13419594, 0.70468321, 0.14021843,
0. , 0.06080186, 0.48946337, 0.01100053, 0.56059965],
[1.15936537, 0.19098635, 0.74752753, 0.74108843, 0.1470605 ,
0.06080186, 0. , 0.61258786, 0.02046045, 0.66652228],
[0.19672877, 0.15497331, 1.08562368, 0.05338715, 0.16449241,
0.48946337, 0.61258786, 0. , 0.54429694, 0.00424306],
[1.16632222, 0.20095384, 0.9704683 , 0.72671958, 0.13755476,
0.01100053, 0.02046045, 0.54429694, 0. , 0.60957115],
[0.14319923, 0.16679585, 1.03211241, 0.0288717 , 0.18859392,
0.56059965, 0.66652228, 0.00424306, 0.60957115, 0. ]])
print(dist_sq.shape)
print(dist_sq_2.shape)
(10, 10)
(10, 10)
X[:, np.newaxis, :]
array([[[0.00706631, 0.02306243]],
[[0.52477466, 0.39986097]],
[[0.04666566, 0.97375552]],
[[0.23277134, 0.09060643]],
[[0.61838601, 0.38246199]],
[[0.98323089, 0.46676289]],
[[0.85994041, 0.68030754]],
[[0.45049925, 0.01326496]],
[[0.94220176, 0.56328822]],
[[0.3854165 , 0.01596625]]])
X[np.newaxis, :, :]
array([[[0.00706631, 0.02306243],
[0.52477466, 0.39986097],
[0.04666566, 0.97375552],
[0.23277134, 0.09060643],
[0.61838601, 0.38246199],
[0.98323089, 0.46676289],
[0.85994041, 0.68030754],
[0.45049925, 0.01326496],
[0.94220176, 0.56328822],
[0.3854165 , 0.01596625]]])
X[:, np.newaxis, :].shape
(10, 1, 2)
X[np.newaxis, :, :].shape
(1, 10, 2)
X[:, np.newaxis]
array([[[0.00706631, 0.02306243]],
[[0.52477466, 0.39986097]],
[[0.04666566, 0.97375552]],
[[0.23277134, 0.09060643]],
[[0.61838601, 0.38246199]],
[[0.98323089, 0.46676289]],
[[0.85994041, 0.68030754]],
[[0.45049925, 0.01326496]],
[[0.94220176, 0.56328822]],
[[0.3854165 , 0.01596625]]])
X[np.newaxis, :]
array([[[0.00706631, 0.02306243],
[0.52477466, 0.39986097],
[0.04666566, 0.97375552],
[0.23277134, 0.09060643],
[0.61838601, 0.38246199],
[0.98323089, 0.46676289],
[0.85994041, 0.68030754],
[0.45049925, 0.01326496],
[0.94220176, 0.56328822],
[0.3854165 , 0.01596625]]])
X[:, np.newaxis].shape
(10, 1, 2)
X[np.newaxis, :].shape
(1, 10, 2)
nearest = np.argsort(dist_sq, axis=1)
nearest
array([[0, 3, 9, 7, 1, 4, 2, 5, 6, 8],
[1, 4, 7, 9, 3, 6, 8, 5, 0, 2],
[2, 1, 4, 6, 3, 0, 8, 9, 7, 5],
[3, 9, 7, 0, 1, 4, 5, 8, 6, 2],
[4, 1, 8, 5, 6, 7, 9, 3, 0, 2],
[5, 8, 6, 4, 1, 7, 9, 3, 2, 0],
[6, 8, 5, 4, 1, 7, 9, 3, 2, 0],
[7, 9, 3, 1, 4, 0, 5, 8, 6, 2],
[8, 5, 6, 4, 1, 7, 9, 3, 2, 0],
[9, 7, 3, 0, 1, 4, 5, 8, 6, 2]])
k = 2
nearest_partition = np.argpartition(dist_sq, k + 1, axis=1)
nearest_partition
array([[3, 0, 9, 7, 1, 4, 2, 5, 8, 6],
[1, 4, 7, 9, 3, 5, 6, 2, 8, 0],
[2, 1, 4, 6, 3, 0, 5, 7, 8, 9],
[3, 9, 7, 0, 1, 5, 6, 2, 8, 4],
[1, 8, 4, 5, 7, 6, 9, 3, 2, 0],
[5, 8, 6, 4, 1, 7, 9, 3, 2, 0],
[6, 8, 5, 4, 1, 7, 9, 3, 2, 0],
[7, 9, 3, 1, 4, 5, 6, 2, 8, 0],
[8, 5, 6, 4, 1, 7, 9, 3, 2, 0],
[3, 9, 7, 0, 1, 5, 6, 2, 8, 4]])
plt.scatter(X[:, 0], X[:, 1], s=100)
# 连接两个最近邻连接
k = 2
for i in range(X.shape[0]):
for j in nearest_partition[i, :k+1]:
# 画一条从 x[i]到x[j] 的线段
# 用 zip 方法实现
plt.plot(*zip(X[j], X[i]), color='black')
结构化数组 和 记录数组,它们为复合的、异构的数据提供了非常有效的存储。
import numpy as np
name = ['Alice', 'Bob', 'Cathy', 'Doug']
age = [25, 45, 37, 19]
weight = [55.0, 85.5, 68.0, 61.5]
# 与之类似的创建方法
x = np.zeros(4, dtype=int)
data = np.zeros(4, dtype={'names': ('name', 'age', 'weight'),
'formats': ('U10', 'i4', 'f8')})
data.dtype
dtype([('name', '<U10'), ('age', '<i4'), ('weight', '<f8')])
U10:长度不超过10的Unicode字符串
i4:4字节(即32比特)整型
f8:8字节(即64比特)浮点型
后面会介绍更多数据类型代码
现在把数据放进数组容器中
data['name'] = name
data['age'] = age
data['weight'] = weight
print(data)
[('Alice', 25, 55. ) ('Bob', 45, 85.5) ('Cathy', 37, 68. )
('Doug', 19, 61.5)]
# 取出名字
data['name']
array(['Alice', 'Bob', 'Cathy', 'Doug'], dtype='<U10')
# 取出第一行数据
data[0]
('Alice', 25, 55.)
# 取出最后一个名字
data[-1]['name']
'Doug'
data['name'][-1]
'Doug'
# 通过掩码操作
# 取出年龄小于30岁的名字
data['name'][data['age'] < 30]
array(['Alice', 'Doug'], dtype='<U10')
data[data['age'] < 30]['name']
array(['Alice', 'Doug'], dtype='<U10')
np.dtype({'names': ('name', 'age', 'weight'),
'formats': ('U10', 'i4', 'f8')})
dtype([('name', '<U10'), ('age', '<i4'), ('weight', '<f8')])
np.dtype({'names': ('name', 'age', 'weight'),
'formats': ((np.str_, 10), int, np.float32)})
dtype([('name', '<U10'), ('age', '<i8'), ('weight', '<f4')])
np.dtype([('name', 'S10'), ('age', 'i4'), ('weight', 'f8')])
dtype([('name', 'S10'), ('age', '<i4'), ('weight', '<f8')])
np.dtype('S10,i4,f8')
dtype([('f0', 'S10'), ('f1', '<i4'), ('f2', '<f8')])
数据类型符号 | 描述 | 示例 |
---|---|---|
'b' |
字节型 | np.dtype('b') |
'i' |
有符号整型 | np.dtype('i4') == np.int32 |
'u' |
无符号整型 | np.dtype('u1') == np.uint8 |
'f' |
浮点型 | np.dtype('f8') == np.float64 |
'c' |
复数浮点型 | np.dtype('c16') == np.complex128 |
'S' , 'a' |
字符串 | np.dtype('S5') |
'U' |
Unicode 编码字符串 | np.dtype('U') == np.str_ |
'V' |
原生数据 (空) | np.dtype('V') == np.void |
tp = np.dtype([('id', 'i8'), ('mat', 'f8', (3, 3))])
X = np.zeros(1, dtype=tp)
print(X[0])
print(X['mat'][0])
(0, [[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]])
[[0. 0. 0.]
[0. 0. 0.]
[0. 0. 0.]]
np.recarray 类
data['age']
array([25, 45, 37, 19], dtype=int32)
data_rec = data.view(np.recarray)
data_rec.age
array([25, 45, 37, 19], dtype=int32)
%timeit data['age']
%timeit data_rec['age']
%timeit data_rec.age
114 ns ± 0.333 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
1.96 µs ± 6.36 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
2.37 µs ± 9.44 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)