Python 实用宝典

repeat

$ conda install --file requirements.txt

$ conda create -n PDSH python=3.5 --file requirements.txt

# conda
conda install pandas
# or PyPI
pip install pandas
pip install cython
python setup.py install
python -m pip install -e . --no-build-isolation --no-use-pep517
python setup.py develop
$ conda install --file requirements.txt

$ conda create -n PDSH python=3.5 --file requirements.txt

import pandas as pd

mydict = {'A': [1], 'B': [2, 3], 'C': [4, 5, 6]}
pd.DataFrame(mydict.items()).explode(1)
mydict.items()
dict_items([('A', [1]), ('B', [2, 3]), ('C', [4, 5, 6])])
pd.DataFrame(mydict.items())
select a,b_i from df lateral view explode(b) tmp as b_i;
result = []
for k, vs in mydict.items():
    for v in vs:
        result.append((k, v))
pd.DataFrame(result)
import itertools

# Pair every key with each value of its list.
# product() iterates over each argument, so the key is wrapped in a
# one-element list; a bare string key such as "AB" would otherwise be
# split into the characters "A" and "B".
result = []
for k, v in mydict.items():
    result.extend(itertools.product([k], v))
pd.DataFrame(result)
product(*iterables, repeat=1) --> product object
result = [(k, v) for k, vs in mydict.items() for v in vs]
pd.DataFrame(result)
df = pd.DataFrame(mydict.items(), columns=["a", "b"])
df
df["b"].apply(pd.Series)
df.set_index("a")["b"].apply(pd.Series)
df["b"].apply(pd.Series).set_index(df["a"])
df.agg({"a": lambda x: x, "b": pd.Series})
df.agg({"a": lambda x: x, "b": pd.Series}).droplevel(0, axis=1)
df.agg({"a": lambda x: x, "b": pd.Series}).droplevel(0, axis=1).set_index("a")
df = pd.DataFrame.from_dict(mydict, 'index')
df = pd.DataFrame(data=mydict.values(), index=mydict.keys())
df = df.rename_axis(index="a").reset_index()
df
df.melt(id_vars='a', value_name='b')
# Unpivot with melt: the dict keys become the index, are moved into a
# regular column "a", and the remaining positional columns are melted.
df = pd.DataFrame.from_dict(mydict, 'index')
# melt(id_vars='a') needs "a" to be a column, so name the index and reset it first.
df = df.rename_axis(index="a").reset_index()
df = df.melt(id_vars='a', value_name='b').drop(columns="variable").dropna()
# Padding the shorter lists with NaN turned the values into floats; restore ints.
df.b = df.b.astype("int")
df
df = pd.DataFrame.from_dict(mydict, 'index')
df.stack()
A  0    1.0
B  0    2.0
   1    3.0
C  0    4.0
   1    5.0
   2    6.0
dtype: float64
df.stack().droplevel(1)
A    1.0
B    2.0
B    3.0
C    4.0
C    5.0
C    6.0
dtype: float64
df.stack().droplevel(1).reset_index()
df.stack().droplevel(1).reset_index().set_axis(["a", "b"], axis=1)
df.b = df.b.astype("int")
df = pd.DataFrame.from_dict(mydict, 'index')
df = df.stack().droplevel(1).reset_index().set_axis(["a", "b"], axis=1)
df.b = df.b.astype("int")
df
import pandas as pd
pd.set_option("display.max_colwidth", 100)

df = pd.read_excel("正则提取与分列.xlsm", usecols=[0])
df.head()

# Extract every "<number>到/至<number>" range from the text column, then
# spread the resulting lists into separate columns.
result = df.copy()
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
result["tmp"] = result["补回原因"].str.findall(r"([\d.]+[到至][\d.]+)")
result = result.agg({"补回原因": lambda x: x, "tmp": pd.Series}).droplevel(0, axis=1)
result.head()

# Step-by-step version of the same extraction, kept on the original frame.
df["tmp"] = df["补回原因"].str.findall(r"([\d.]+[到至][\d.]+)")
df.head(5)

df.agg({"补回原因": lambda x: x, "tmp": pd.Series})

df.agg({"补回原因": lambda x: x, "tmp": pd.Series}).droplevel(0, axis=1).head()

>>> df = pd.DataFrame([
...     [1, 2, 3, 4],
...     [5, 6, 7, 8],
...     [9, 10, 11, 12]
... ]).set_index([0, 1]).rename_axis(['a', 'b'])
>>> df.columns = pd.MultiIndex.from_tuples([
...    ('c', 'e'), ('d', 'f')
... ], names=['level_1', 'level_2'])
>>> df
level_1   c   d
level_2   e   f
a b
1 2      3   4
5 6      7   8
9 10    11  12
>>> df.droplevel('a')
level_1   c   d
level_2   e   f
b
2        3   4
6        7   8
10      11  12
>>> df.droplevel('level_2', axis=1)
level_1   c   d
a b
1 2      3   4
5 6      7   8
9 10    11  12

df = pd.read_excel("分组聚合并分列.xlsx")
df

(
    df.groupby("姓名")["得分"]
    .apply(list)
    .apply(pd.Series)
    .fillna("")
    .rename(columns=lambda x: f"得分{x+1}")
    .reset_index()
    .astype({"得分1":"int8"})
)

df.groupby("姓名")["得分"].apply(list)

姓名
孙四娘          [7, 28]
看见星光    [88, 28, 23]
看见月光    [69, 10, 87]
老祝          [51, 29]
马青梅             [99]
Name: 得分, dtype: object

df.groupby("姓名")["得分"].apply(lambda x:x.to_list())

_.apply(pd.Series)

_.fillna("")

_.rename(columns=lambda x: f"得分{x+1}")

_.reset_index()

_.astype({"得分1":"int8"})

# Load the sheet whose "features" column holds dict-like strings.
df = pd.read_excel("字典分列.xlsx")
df.head()

# Turn each string into a dict, then spread the dict keys into columns.
# NOTE(review): eval() executes whatever expression is stored in the cell,
# so this is only acceptable for a trusted workbook. If the cells contain
# plain dict literals, ast.literal_eval would be a safer parser — confirm
# against the data before switching.
result = df.features.apply(eval).apply(pd.Series)
result["counts"] = df.counts
result

import pandas as pd
data = pd.read_excel('测试.xlsx')

data.head()

# A target path is passed, so the HTML is written to that file.
# NOTE(review): per the pandas documentation to_html() returns None when a
# buffer/path is given, so html_table is None here, not the HTML text.
html_table = data.to_html('测试.html')

# Without a path the HTML is returned as a string and can be printed.
print(data.to_html())

# Same export with the header kept, the index dropped and centred column
# headers; html_table is again None because the output goes to the file.
html_table = data.to_html('测试.html',header = True,index = False,justify='center')

import pandas as pd

product = pd.read_excel('sample.xlsx', sheet_name='A')
cost = pd.read_excel('sample.xlsx', sheet_name='B')

fi_cost = cost.set_index(['地区代码','地区缩写']).stack().reset_index()
result = pd.merge(product, fi_cost, on='地区代码', how='left')
result.columns = ['产品ID', '地区代码', '重量', '地区缩写', '重量区间', '价格']
result[['最低区间', '最高区间']] = result['重量区间'].str.split('~', expand=True).astype(float)
result.query("最低区间<=`重量`<=最高区间")

import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

product = pd.read_excel('sample.xlsx', sheet_name='A')
cost = pd.read_excel('sample.xlsx', sheet_name='B')

product.head()
cost.head()

fi_cost = cost.melt(id_vars=["地区代码", "地区缩写"], var_name="重量区间", value_name='价格')
fi_cost

fi_cost.重量区间 = fi_cost.重量区间.str.split("~").str[1].astype("float")
fi_cost.sort_values(["地区代码", "重量区间"], inplace=True, ignore_index=True)
fi_cost.head(10)

fi_cost_g = fi_cost.groupby("地区代码")
for product_id, area_id, weight in product.values:
    print(product_id, area_id, weight)
    cost_table = fi_cost_g.get_group(area_id)
    display(cost_table)
    break

fi_cost_g = fi_cost.groupby("地区代码")[["地区缩写", "重量区间", "价格"]]
for product_id, area_id, weight in product.values:
    print(product_id, area_id, weight)
    cost_table = fi_cost_g.get_group(area_id)
    display(cost_table)
    for area, weight_cost, price in cost_table.values:
        if weight <= weight_cost:
            print(area, price)
            break
    break

# Sequential lookup: for each product row, scan the brackets of its region
# and stop at the first one whose upper bound is >= the product weight.
# This relies on fi_cost having been sorted by region code and bracket
# upper bound beforehand, so the first hit is the smallest fitting bracket.
result = []
fi_cost_g = fi_cost.groupby("地区代码")[["地区缩写", "重量区间", "价格"]]
for product_id, area_id, weight in product.values:
    cost_table = fi_cost_g.get_group(area_id)
    for area, weight_cost, price in cost_table.values:
        if weight <= weight_cost:
            break
    # NOTE(review): when no bracket matches, the loop ends without break and
    # area/price keep the values of the last bracket scanned, so an
    # overweight product silently receives the top-bracket price.
    result.append((product_id, area_id, area, weight, price))
result = pd.DataFrame(result, columns=["产品ID", "地区代码", "地区缩写", "重量（kg）", "价格"])
result

import pandas as pd

product = pd.read_excel('sample.xlsx', sheet_name='A')
cost = pd.read_excel('sample.xlsx', sheet_name='B')

fi_cost = cost.melt(id_vars=["地区代码", "地区缩写"], var_name="重量区间", value_name='价格')
fi_cost.重量区间 = fi_cost.重量区间.str.split("~").str[1].astype("float")
fi_cost.sort_values(["地区代码", "重量区间"], inplace=True, ignore_index=True)
result = []
fi_cost_g = fi_cost.groupby("地区代码")[["地区缩写", "重量区间", "价格"]]
for product_id, area_id, weight in product.values:
    cost_table = fi_cost_g.get_group(area_id)
    for area, weight_cost, price in cost_table.values:
        if weight <= weight_cost:
            break
    result.append((product_id, area_id, area, weight, price))
result = pd.DataFrame(result, columns=["产品ID", "地区代码", "地区缩写", "重量（kg）", "价格"])
result
import pandas as pd

product = pd.read_excel('sample.xlsx', sheet_name='A')
cost = pd.read_excel('sample.xlsx', sheet_name='B')
cost.head()

price_range = cost.columns[2:].str.split("~").str[1].astype("float").tolist()
price_range

[0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 7.0, 10.0, 15.0, 100000.0]

import bisect
import numpy as np

for a in np.linspace(0.5, 5, 10):
    idx = bisect.bisect_left(price_range, a)
    print(a, idx)

0.5 0
1.0 1
1.5 2
2.0 2
2.5 3
3.0 3
3.5 4
4.0 4
4.5 5
5.0 5

print(*enumerate(price_range))

(0, 0.5) (1, 1.0) (2, 2.0) (3, 3.0) (4, 4.0) (5, 5.0) (6, 7.0) (7, 10.0) (8, 15.0) (9, 100000.0)

# Lookup table keyed by (region code, bracket index) -> (region name, price).
# Each row of cost.values is unpacked as region code, region name, and then
# one price per weight-bracket column.
cost_dict = {}
for area_id, area, *prices in cost.values:
    # The inner loop must be nested under the row loop (it was unindented).
    for idx, price in enumerate(prices):
        cost_dict[(area_id, idx)] = area, price

result = []
for product_id, area_id, weight in product.values:
    idx = bisect.bisect_left(price_range, weight)
    area, price = cost_dict[(area_id, idx)]
    result.append((product_id, area_id, area, weight, price))
result = pd.DataFrame(result, columns=["产品ID", "地区代码", "地区缩写", "重量（kg）", "价格"])
result

import pandas as pd
import bisect

product = pd.read_excel('sample.xlsx', sheet_name='A')
cost = pd.read_excel('sample.xlsx', sheet_name='B')
price_range = cost.columns[2:].str.split("~").str[1].astype("float").tolist()
# Lookup table keyed by (region code, bracket index) -> (region name, price).
cost_dict = {}
for area_id, area, *prices in cost.values:
    # Nested under the row loop so every price column of the row is stored.
    for idx, price in enumerate(prices):
        cost_dict[(area_id, idx)] = area, price
result = []
for product_id, area_id, weight in product.values:
    idx = bisect.bisect_left(price_range, weight)
    area, price = cost_dict[(area_id, idx)]
    result.append((product_id, area_id, area, weight, price))
result = pd.DataFrame(result, columns=["产品ID", "地区代码", "地区缩写", "重量（kg）", "价格"])
result

import pandas as pd
import bisect

product = pd.read_excel('sample.xlsx', sheet_name='A')
cost = pd.read_excel('sample.xlsx', sheet_name='B')
price_range = cost.columns[2:].str.split("~").str[1].astype("float").tolist()
cost.columns = ["地区代码", "地区缩写"]+list(range(cost.shape[1]-2))
cost = cost.melt(id_vars=["地区代码", "地区缩写"],
                       var_name='idx', value_name='运费')
product["idx"] = product["重量（kg）"].apply(
lambda weight: bisect.bisect_left(price_range, weight))
result = pd.merge(product, cost, on=['地区代码', 'idx'], how='left')
result.drop(columns=["idx"], inplace=True)
result

conda install geopandas
>>> import geopandas
>>> from shapely.geometry import Polygon
>>> p1 = Polygon([(0, 0), (1, 0), (1, 1)])
>>> p2 = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
>>> p3 = Polygon([(2, 0), (3, 0), (3, 1), (2, 1)])
>>> g = geopandas.GeoSeries([p1, p2, p3])
>>> g
0    POLYGON ((0 0, 1 0, 1 1, 0 0))
1    POLYGON ((0 0, 1 0, 1 1, 0 1, 0 0))
2    POLYGON ((2 0, 3 0, 3 1, 2 1, 2 0))
dtype: geometry
>>> print(g.area)
0    0.5
1    1.0
2    1.0
dtype: float64
>>> g.plot()
import matplotlib.pyplot as plt
g.plot()
plt.savefig("test.png")
import geopandas
import matplotlib.pyplot as plt
from shapely.geometry import Polygon
maps = geopandas.read_file('1.shx')
# 读取的数据格式类似于
#                                             geometry
# 0   POLYGON ((1329152.341 5619034.278, 1323327.591...
# 1   POLYGON ((-2189253.375 4611401.367, -2202922.3...
# 2   POLYGON ((761692.092 4443124.843, 760999.873 4...
# 3   POLYGON ((-34477.046 4516813.963, -41105.128 4...
# ... ...
maps.plot()
plt.savefig("test.png")
print(maps.area)
# 0     4.156054e+11
# 1     1.528346e+12
# 2     1.487538e+11
# 3     4.781135e+10
# 4     1.189317e+12
# 5     1.468277e+11
# 6     1.597052e+11
# 7     9.770609e+10
# 8     1.385692e+11
# 9     1.846538e+11
# 10    1.015979e+11
# ... ...

	储存条件	品牌	推荐理由	品种	食用方式	是否进口	特色服务	是否有机	counts
0	常温	NaN	NaN	NaN	NaN	NaN	NaN	NaN	33
1	冷藏	NaN	NaN	NaN	NaN	NaN	NaN	NaN	24
2	常温	禾煜	NaN	NaN	NaN	NaN	NaN	NaN	22
3	常温	妙洁	NaN	NaN	NaN	NaN	NaN	NaN	16
4	冷冻	NaN	NaN	NaN	NaN	NaN	NaN	NaN	14
…	…	…	…	…	…	…	…	…	…
2083	常温	乐事	够薄够脆	NaN	NaN	NaN	NaN	NaN	1
2084	冷藏	NaN	生态种植	黄瓜	NaN	NaN	NaN	有机	1
2085	冷藏	NaN	腥味较淡	鲫鱼	NaN	NaN	免费宰杀	NaN	1
2086	冷藏	NaN	甜脆可口	佛手瓜	NaN	NaN	NaN	NaN	1
2087	冷藏	叮咚日日鲜	全程可追溯	猪小排	NaN	NaN	NaN	NaN	1

如何使用本书

关于

软件

许可证

代码

文本

Pandas：功能强大的Python数据分析工具包

它是什么？

主要功能

在哪里获取它？

依赖项

从源码安装

许可证

文档

背景

获取帮助

研讨与发展

Python Data Science Handbook

如何使用本书

关于

软件

许可证

代码

文本

1.问题来源

2.解法

2.1 基础解法explode函数

2.2 没有explode函数如何解决这个问题

2.3 循环法解题

2.4 变形法解题

2.4.1 列表分列的2种方法

对Series进行列表分列

直接对DataFrame进行列表分列

2.4.2 将字典的键作为索引的2种读取方法

2.4.3 melt实现逆透视

2.4.4 stack实现逆透视

3.实际应用

正则提取并分列

分组聚合并分列

解析json字符串并字典分列

读取Excel

生成Html

调整格式

小结

1.简单案例讲解

Pandas案例需求

解题思路

人海战术

解构战术

具体实现

2.复杂一点的情况

Pandas案例需求

上文的简化写法

顺序查找匹配

3.优化方案

字典查找+二分查找高效匹配

两种算法的性能对比

将非等值连接转换为等值连接

1.准备

2.基本使用

3.绘制并算出每个省的面积

有趣好用的Python教程