机器学习Pandas_learn3

from pandas import DataFrame
import numpy
paints={"车名":["奥迪Q5L","哈弗H6","奔驰GLC"],
                    "最低报价":[numpy.nan,9.80,numpy.nan],
                    "最高报价":[49.80,23.10,58.78]}
goods_in=DataFrame(paints,index=[1,2,3])
print(goods_in)
goods_in_nonull=goods_in.dropna(axis=1)
print(goods_in_nonull)
      车名  最低报价   最高报价
1  奥迪Q5L   NaN  49.80
2   哈弗H6   9.8  23.10
3  奔驰GLC   NaN  58.78
      车名   最高报价
1  奥迪Q5L  49.80
2   哈弗H6  23.10
3  奔驰GLC  58.78
from pandas import DataFrame
kindergarten1={"小朋友数目":{"1班":32,"2班":20},
               "小朋友睡床":{"1班":40,"2班":30},
               "上课教室":{"1班":3,"2班":2}}
kindergarten2={"小朋友数目":{"1班":10,"2班":21,"3班":15},
               "小朋友睡床":{"1班":11,"2班":21,"3班":16},
               "上课教室":{"1班":1,"2班":2,"3班":2}}
kindergarten_dataframe1=DataFrame(kindergarten1)
kindergarten_dataframe2=DataFrame(kindergarten2)
kindergarten_all=kindergarten_dataframe1+kindergarten_dataframe2
print(kindergarten_all)
    小朋友数目  小朋友睡床  上课教室
1班   42.0   51.0   4.0
2班   41.0   51.0   4.0
3班    NaN    NaN   NaN
from pandas import DataFrame,Series
kindergarten1={"小朋友数目":[32,20],
               "小朋友睡床":[40,30],
               "上课教室":[3,2]}
kindergarten2={"小朋友数目":16,
               "小朋友睡床":19,
               "上课教室":2}
kindergarten_dataframe1=DataFrame(kindergarten1)
kindergarten_series1=Series(kindergarten2)
kindergarten_all=kindergarten_dataframe1+kindergarten_series1
print(kindergarten_all)
   小朋友数目  小朋友睡床  上课教室
0     48     59     5
1     36     49     4
from pandas import DataFrame
import numpy
paints={"车名":["奥迪Q5L","哈弗H6","奔驰GLC"],
        "最低报价":[numpy.nan,9.80,numpy.nan],
        "最高报价":[49.80,numpy.nan,58.78]}
goods_in=DataFrame(paints,index=[1,2,3])
goods_in_isnull=goods_in.isnull()
print(goods_in_isnull)
      车名   最低报价   最高报价
1  False   True  False
2  False  False   True
3  False   True  False
from pandas import DataFrame
import numpy
paints={"车名":["奥迪Q5L","哈弗H6","奔驰GLC"],
        "最低报价":[numpy.nan,9.80,numpy.nan],
        "最高报价":[49.80,23.10,58.78]}
goods_in=DataFrame(paints,index=[1,2,3])
goods_in_nonull=goods_in.fillna(10)
print(goods_in_nonull)
      车名  最低报价   最高报价
1  奥迪Q5L  10.0  49.80
2   哈弗H6   9.8  23.10
3  奔驰GLC  10.0  58.78
from pandas import DataFrame
import numpy
paints={"车名":["奥迪Q5L","哈弗H6","奔驰GLC"],
        "最低报价":[numpy.nan,9.80,numpy.nan],
        "最高报价":[49.80,23.10,numpy.nan]}
goods_in=DataFrame(paints,index=[1,2,3])
goods_in_fill=goods_in.fillna({"最低报价":10,"最高报价":20})
print(goods_in_fill)
      车名  最低报价  最高报价
1  奥迪Q5L  10.0  49.8
2   哈弗H6   9.8  23.1
3  奔驰GLC  10.0  20.0
from pandas import DataFrame
import numpy
# Car quotes in which each price column is missing exactly one value.
paints = {
    "车名": ["奥迪Q5L", "哈弗H6", "奔驰GLC"],
    "最低报价": [9.80, numpy.nan, 15.42],
    "最高报价": [49.80, 23.10, numpy.nan],
}
goods_in = DataFrame(paints, index=[1, 2, 3])
# Forward-fill: every NaN takes the last valid value above it in the same
# column. DataFrame.ffill() is the supported spelling; the equivalent
# fillna(method="ffill") is deprecated in recent pandas releases.
goods_in_fill = goods_in.ffill()
print(goods_in_fill)
      车名   最低报价  最高报价
1  奥迪Q5L   9.80  49.8
2   哈弗H6   9.80  23.1
3  奔驰GLC  15.42  23.1
from pandas import DataFrame
import numpy as np

# Car quotes in which each price column is missing exactly one value.
paints = {
    "车名": ["奥迪Q5L", "哈弗H6", "奔驰GLC"],
    "最低报价": [9.80, np.nan, 15.42],
    "最高报价": [49.80, 23.10, np.nan],
}

# Build the DataFrame with explicit row labels 1..3.
goods_in = DataFrame(paints, index=[1, 2, 3])

# Per-column medians. median() skips NaN inside each column by default, so
# no dropna() is needed. Calling dropna() first would discard every row that
# has a NaN in *any* column, leaving only row 1 here, and the "medians"
# would then just be row 1's own values.
medians = goods_in[["最低报价", "最高报价"]].median()

# Passing a Series to fillna fills each column with the value whose label
# matches the column name; "车名" has no entry and is left untouched.
goods_in_fill = goods_in.fillna(medians)

# Show the filled DataFrame.
print(goods_in_fill)
# Expected output (column spacing approximate):
#       车名   最低报价   最高报价
# 1  奥迪Q5L   9.80  49.80
# 2   哈弗H6  12.61  23.10
# 3  奔驰GLC  15.42  36.45
from pandas import DataFrame
import numpy
paints={"车名":["奥迪Q5L","哈弗H6","奔驰GLC","奥迪Q5L","哈弗H6"],
"最低报价":[9.80,14.35,15.42,9.80,14.35],
"最高报价":[49.80,23.10,60.45,49.80,23.10]}
goods_in=DataFrame(paints)
goods_in_duplicated=goods_in.duplicated()
print(goods_in_duplicated)


# 代码对保存汽车数据的DataFrame调用duplicated()方法查找重复行:
# 如果某一行与前面已经出现过的行完全相同,该行对应的输出就是True
0    False
1    False
2    False
3     True
4     True
dtype: bool
from pandas import DataFrame
import numpy
paints={"车名":["奥迪Q5L","哈弗H6","奔驰GLC","奥迪Q5L","哈弗H6"],
"最低报价":[9.80,14.35,15.42,9.80,14.35],
"最高报价":[49.80,23.10,60.45,49.80,23.10]}
goods_in=DataFrame(paints)
goods_in_duplicated=goods_in.drop_duplicates()
print(goods_in_duplicated)
      车名   最低报价   最高报价
0  奥迪Q5L   9.80  49.80
1   哈弗H6  14.35  23.10
2  奔驰GLC  15.42  60.45
from pandas import DataFrame
import numpy
paints={"车名":["奥迪Q5L","哈弗H6","奔驰GLC","奥迪Q5L","哈弗H6"],
        "最低报价":[9.80,14.35,15.42,9.80,14.35],
        "最高报价":[49.80,23.10,60.45,49.80,23.10]}
goods_in=DataFrame(paints)
goods_in_duplicated=goods_in.drop_duplicates(["车名","最低报价","最高报价"],keep="last")
print(goods_in_duplicated)

# 代码使用drop_duplicates()方法对“车名”“最低报价”“最高
# 报价”3个维度中的重复数据采用keep="last"参数保留最后一个重复
# 项。
      车名   最低报价   最高报价
2  奔驰GLC  15.42  60.45
3  奥迪Q5L   9.80  49.80
4   哈弗H6  14.35  23.10
from pandas import DataFrame
import numpy as np
paints={"车名":["奥迪Q5L","哈弗H6","奔驰GLC","奥迪Q5L","哈弗H6"],
        "最低报价":[9.80,14.35,15.42,9.80,np.nan],
        "最高报价":[49.80,23.45,np.nan,49.80,23.10]}
goods_in=DataFrame(paints)
goods_in_replace=goods_in.replace(np.nan,20.50)
print(goods_in_replace)
      车名   最低报价   最高报价
0  奥迪Q5L   9.80  49.80
1   哈弗H6  14.35  23.45
2  奔驰GLC  15.42  20.50
3  奥迪Q5L   9.80  49.80
4   哈弗H6  20.50  23.10
from pandas import DataFrame
import numpy as np
paints={"车名":["奥迪Q5L","哈弗H6","奔驰GLC","奥迪Q5L","哈弗H6"],
    "最低报价":[9.80,14.35,15.42,0,np.nan],
    "最高报价":[0,23.45,np.nan,49.80,23.10]}
goods_in=DataFrame(paints)
goods_in_replace=goods_in.replace({np.nan:20.50,0:25.47})
print(goods_in_replace)

# 代码中replace()方法传入一个字典,字典的键分别是np.nan和
# 0,也就意味着DataFrame数据中的np.nan数据和0数据都将被替换成别
# 的数据,np.nan替换成对应的键的值20.50,0替换成对应的键的值
# 25.47
      车名   最低报价   最高报价
0  奥迪Q5L   9.80  25.47
1   哈弗H6  14.35  23.45
2  奔驰GLC  15.42  20.50
3  奥迪Q5L  25.47  49.80
4   哈弗H6  20.50  23.10
import numpy as np
paints={"车名":["奥迪Q5L","哈弗H6","奔驰GLC","奥迪Q5L","哈弗H6"],
    "最低报价":[9.80,14.35,15.42,0,12.35],
    "最高报价":[0,23.45,26.47,49.80,23.10]}
goods_in=DataFrame(paints,index=[0,1,2,3,4])
goods_in_permutation=np.random.permutation(goods_in)
print(goods_in_permutation)
[['哈弗H6' 12.35 23.1]
 ['哈弗H6' 14.35 23.45]
 ['奔驰GLC' 15.42 26.47]
 ['奥迪Q5L' 0.0 49.8]
 ['奥迪Q5L' 9.8 0.0]]
from pandas import DataFrame
import numpy as np
paints={"车名":["奥迪Q5L","哈弗H6","奔驰GLC","奥迪Q5L","哈弗H6"],
    "最低报价":[9.80,14.35,15.42,0,12.35],
    "最高报价":[0,23.45,26.47,49.80,23.10]}
goods_in=DataFrame(paints,index=[0,1,2,3,4])
goods_in_permutation=goods_in.take(np.random.permutation(len(goods_in)))
print(goods_in_permutation)
      车名   最低报价   最高报价
4   哈弗H6  12.35  23.10
0  奥迪Q5L   9.80   0.00
3  奥迪Q5L   0.00  49.80
2  奔驰GLC  15.42  26.47
1   哈弗H6  14.35  23.45
# 从pandas库中导入DataFrame类
from pandas import DataFrame
# 创建一个字典paints,其中包含三个键值对
# 键"车名"对应的值是一个包含三款汽车名称的列表
# 键"最低报价"对应的值是一个包含三款汽车最低报价的列表
# 键"最高报价"对应的值是一个包含三款汽车最高报价的列表
paints = {
    "车名": ["奥迪Q5L", "哈弗H6", "奔驰GLC"],
    "最低报价": [38.78, 9.80, 39.48],
    "最高报价": [49.80, 14.10, 58.78],
}
# Turn the dictionary into a DataFrame with row labels [1, 2, 3].
goods_in = DataFrame(paints, index=[1, 2, 3])


def f(x):
    """Min-max normalise a numeric column.

    Applies (x - min) / (max - min), so the smallest value maps to 0 and
    the largest to 1.

    :param x: numeric Series (one DataFrame column when used with apply)
    :return: Series of the same shape with the rescaled values
    """
    low = x.min()
    span = x.max() - low
    if span == 0:
        # Constant column: dividing would give 0/0 = NaN, so map every
        # value to 0 instead.
        return x - low
    return (x - low) / span


# Normalise both price columns in place; apply() calls f once per column.
goods_in[["最低报价", "最高报价"]] = goods_in[["最低报价", "最高报价"]].apply(f)
# Show the normalised DataFrame.
print(goods_in)



      车名      最低报价      最高报价
1  奥迪Q5L  0.976415  0.799015
2   哈弗H6  0.000000  0.000000
3  奔驰GLC  1.000000  1.000000
from pandas import DataFrame
paints={"车名":["奥迪Q5L","哈弗H6","奔驰GLC"],
        "最低报价":[38.78,9.80,39.48],
        "最高报价":[49.80,14.10,58.78]}
goods_in=DataFrame(paints,index=["L车","K车","D车"])
goods_in=goods_in.sort_index()
print(goods_in)
       车名   最低报价   最高报价
D车  奔驰GLC  39.48  58.78
K车   哈弗H6   9.80  14.10
L车  奥迪Q5L  38.78  49.80
from pandas import DataFrame
goods_in=DataFrame([["奥迪Q5L",38.78,49.80],["哈弗H6",9.80,58.78],["奔驰GLC",14.10,39.48]],
index=["L车","K车","D车"],columns=["names","low_price","high_price"])
goods_in=goods_in.sort_index(axis=1)
print(goods_in)
    high_price  low_price  names
L车       49.80      38.78  奥迪Q5L
K车       58.78       9.80   哈弗H6
D车       39.48      14.10  奔驰GLC
from pandas import DataFrame
paints={"车名":["奥迪Q5L","哈弗H6","奔驰GLC"],
        "最低报价":[38.78,9.80,39.48],
        "最高报价":[49.80,14.10,58.78]}
goods_in=DataFrame(paints,index=["L车","K车","D车"])
goods_in=goods_in.sort_index(ascending=False)
print(goods_in)
       车名   最低报价   最高报价
L车  奥迪Q5L  38.78  49.80
K车   哈弗H6   9.80  14.10
D车  奔驰GLC  39.48  58.78
from pandas import DataFrame
paints={"车名":["奥迪Q5L","哈弗H6","奔驰GLC"],
        "最低报价":[38.78,9.80,39.48],
        "最高报价":[49.80,14.10,58.78]}
goods_in=DataFrame(paints,index=[1,2,3])
goods_in=goods_in.sort_values(by="最低报价")
print(goods_in)
      车名   最低报价   最高报价
2   哈弗H6   9.80  14.10
1  奥迪Q5L  38.78  49.80
3  奔驰GLC  39.48  58.78
from pandas import DataFrame
paints={"车名":["奥迪Q5L","哈弗H6","奔驰GLC"],
        "最低报价":[38.78,9.80,39.48],
        "最高报价":[49.80,14.10,58.78]}
goods_in=DataFrame(paints,index=[1,2,3])
print(goods_in)
goods_in=goods_in.rank()
print(goods_in)

# 车名列:车名 是字符串类型,rank() 会按字符串的字典序(即 Unicode 码点顺序)排名,而不是按索引顺序。“哈” < “奔” < “奥”,所以 哈弗H6 排名为 1.0,奔驰GLC 排名为 2.0,奥迪Q5L 排名为 3.0。
# 最低报价列:哈弗H6 的最低报价 9.80 是最小的,所以排名为 1.0;奥迪Q5L 的最低报价 38.78 次之,排名为 2.0;奔驰GLC 的最低报价 39.48 最大,排名为 3.0。
# 最高报价列:哈弗H6 的最高报价 14.10 最小,排名为 1.0;奥迪Q5L 的最高报价 49.80 次之,排名为 2.0;奔驰GLC 的最高报价 58.78 最大,排名为 3.0。
      车名   最低报价   最高报价
1  奥迪Q5L  38.78  49.80
2   哈弗H6   9.80  14.10
3  奔驰GLC  39.48  58.78
    车名  最低报价  最高报价
1  3.0   2.0   2.0
2  1.0   1.0   1.0
3  2.0   3.0   3.0
import pandas as pd

# 创建数据字典
paints = {
    "车名": ["奥迪Q5L", "哈弗H6", "奔驰GLC"],
    "最低报价": [38.78, 9.80, 39.48],
    "最高报价": [49.80, 14.10, 58.78]
}

# 创建 DataFrame 对象,并指定行索引
goods_in = pd.DataFrame(paints, index=[1, 2, 3])

# 只选择数值列(最低报价和最高报价)进行按列排名
numeric_columns = ["最低报价", "最高报价"]
goods_in[numeric_columns] = goods_in[numeric_columns].rank()

# 打印结果
print(goods_in)
      车名  最低报价  最高报价
1  奥迪Q5L   2.0   2.0
2   哈弗H6   1.0   1.0
3  奔驰GLC   3.0   3.0
from pandas import DataFrame
paints={"车名":["奥迪Q5L","哈弗H6","奔驰GLC","奔驰GLC","奥迪Q5L"],
    "最低报价":[38.78,9.80,39.48,39.48,38.78],
    "最高报价":[49.80,14.10,58.78,58.78,49.80]}
goods_in=DataFrame(paints,index=["一辆车","一辆车","一辆车","一辆车","一辆车"])
goods_in_unique=goods_in.index.is_unique
print(goods_in_unique)
goods_in_value=goods_in.index.unique()
print(goods_in_value)
False
Index(['一辆车'], dtype='object')
from pandas import DataFrame
paints={"地址":["北京市","大兴区","黄村镇","卫星城"],
"购物车内每件商品价格":[38.78,9.80,39.48,39.48]}
goods_in=DataFrame(paints)
goods_sum=goods_in.sum()
print(goods_sum)
地址            北京市大兴区黄村镇卫星城
购物车内每件商品价格          127.54
dtype: object
import pandas as pd
import numpy as np

def calculate_total_purchases(data_dict):
    """
    Build a DataFrame from ``data_dict`` and total its numeric columns per row.

    Non-numeric columns are excluded before summing, and NaN entries are
    skipped, so a row with missing values is totalled over what remains.

    :param data_dict: dictionary of column name -> list of values
    :return: Series with one total per row, or None if any step raised
    """
    try:
        # Keep only the numeric columns, then add them up across each row.
        numeric_part = pd.DataFrame(data_dict).select_dtypes(include=[np.number])
        totals = numeric_part.sum(axis=1, skipna=True)
    except Exception as e:
        # Best-effort helper: report the problem and signal failure with None.
        print(f"计算过程中出现错误: {e}")
        return None
    return totals

# 会员购买信息字典
paints = {
    "会员名": ["小王", "小李", "小张", "小凤"],
    "苹果": [5, 4, 3, np.nan],
    "橘子": [4, 2, 1, 2],
    "石榴": [3, 1, 1, np.nan]
}

# 调用函数计算每行总和
goods_sum = calculate_total_purchases(paints)

if goods_sum is not None:
    print(goods_sum)


# 按行求和(axis=1),即把同一会员各列的数量相加:
# 第0行 5 + 4 + 3 = 12;第1行 4 + 2 + 1 = 7;第2行 3 + 1 + 1 = 5;第3行的 NaN 被跳过,只剩 2
0    12.0
1     7.0
2     5.0
3     2.0
dtype: float64
import pandas as pd
import numpy as np

def calculate_total_purchases(data_dict):
    """
    Total the purchased quantities of each member, propagating missing data.

    The '会员名' column is dropped and the remaining columns are summed
    across each row with ``skipna=False``, so a row containing any NaN
    yields NaN rather than a partial total.

    :param data_dict: dictionary holding a '会员名' column plus one column
        of quantities per product
    :return: Series with one total per row; None if '会员名' is missing or
        any other error occurs (a message is printed in both cases)
    """
    try:
        frame = pd.DataFrame(data_dict)
        # Remove the label column so that only quantity columns are summed.
        return frame.drop(columns=['会员名']).sum(axis=1, skipna=False)
    except KeyError as ke:
        print(f"数据字典中缺少必要的列: {ke}")
    except Exception as e:
        print(f"发生未知错误: {e}")
    # Reached only after one of the handlers above has reported an error.
    return None

# 定义会员购买信息字典
paints = {
    "会员名": ["小王", "小李", "小张", "小凤"],
    "苹果": [5, 4, 3, np.nan],
    "橘子": [4, 2, 1, 2],
    "石榴": [3, 1, 1, np.nan]
}

# 调用函数计算总购买量
result = calculate_total_purchases(paints)
if result is not None:
    print(result)
0    12.0
1     7.0
2     5.0
3     NaN
dtype: float64
from pandas import DataFrame
import numpy as np
paints={"会员名":["小王","小李","小张","小凤"],
          "苹果":[5,4,3,np.nan],
          "橘子":[4,2,1,2],
          "石榴":[3,1,1,np.nan]}
goods_in=DataFrame(paints)
goods_sum=goods_in[["苹果","橘子","石榴"]].cumsum()
print(goods_sum)
     苹果  橘子   石榴
0   5.0   4  3.0
1   9.0   6  4.0
2  12.0   7  5.0
3   NaN   9  NaN
from pandas import DataFrame
import numpy as np
paints={"会员名":["小王","小李","小张","小凤"],
          "苹果":[5,4,3,np.nan],
          "橘子":[4,2,1,2],
          "石榴":[3,1,1,np.nan]}
goods_in=DataFrame(paints)
goods_sum=goods_in.describe()
print(goods_sum)
        苹果        橘子        石榴
count  3.0  4.000000  3.000000
mean   4.0  2.250000  1.666667
std    1.0  1.258306  1.154701
min    3.0  1.000000  1.000000
25%    3.5  1.750000  1.000000
50%    4.0  2.000000  1.000000
75%    4.5  2.500000  2.000000
max    5.0  4.000000  3.000000

你可能感兴趣的:(机器学习,pandas)