Common Python Code Snippets for Data Analysis / Data Science

Changing the pip package index (Python)

!pip install package_name -i https://pypi.douban.com/simple/
## install a package from the specified mirror; the mirror URL can be changed as needed

Model serving

Deploying a Keras model as a service

import tensorflow as tf
## freeze training session
def freeze_session(session, keep_var_names=None, output_names=None, clear_devices=True):
    ## get session graph
    graph = session.graph
    with graph.as_default():
        ## remove training related nodes
        freeze_var_names = list(set(v.op.name for v in tf.global_variables()).difference(keep_var_names or []))
        output_names = output_names or []
        output_names += [v.op.name for v in tf.global_variables()]
        input_graph_def = graph.as_graph_def()
        ## remove device info if trained on gpu
        if clear_devices:
            for node in input_graph_def.node:
                node.device = ""
        frozen_graph = tf.graph_util.convert_variables_to_constants(
            session, input_graph_def, output_names, freeze_var_names)
        return frozen_graph
 
from keras import backend as K
## model is the trained Keras model to export
frozen_graph = freeze_session(K.get_session(),
                              output_names=[out.op.name for out in model.outputs])
tf.train.write_graph(frozen_graph, "./", "model.pb", as_text=False)
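
A minimal sketch of loading the exported model.pb back for inference with the TF 1.x API; the tensor names "input:0" and "output:0" and the variable input_data are placeholders to be replaced with your model's actual node names and data.

import tensorflow as tf
## read the serialized GraphDef from disk
with tf.gfile.GFile("model.pb", "rb") as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())
## import the frozen graph into a fresh graph
graph = tf.Graph()
with graph.as_default():
    tf.import_graph_def(graph_def, name="")
## run inference by feeding and fetching tensors by name (names are placeholders)
with tf.Session(graph=graph) as sess:
    result = sess.run("output:0", feed_dict={"input:0": input_data})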

Python script that can be deployed as a service

__name__ = 'model'
def func(input_object):
    ## your code here
    return output_object
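
A minimal sketch of the skeleton filled in, assuming input_object is a dict of feature values; the field names and scoring logic here are purely illustrative.

__name__ = 'model'
def func(input_object):
    ## illustrative scoring logic; replace with your actual model call
    features = [input_object['feature_a'], input_object['feature_b']]
    output_object = {'score': sum(features)}
    return output_object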

SQL operations

Connecting to PostgreSQL

import psycopg2
import pandas as pd
connection = psycopg2.connect(user="username",
                              password="password",
                              host="127.0.0.1",
                              port="5432",
                              database="db_name")
cursor = connection.cursor()
test_query = """SELECT subject_id, hadm_id, admittime, dischtime, admission_type, diagnosis
FROM admissions
"""
test = pd.read_sql_query(test_query, connection)

Exporting data from a single table

### Fetch data from a single table
### Filter with WHERE and HAVING clauses
### Sort in descending order by the order_by_list columns
### conn is the PostgreSQL connection object
query_string = """SELECT [DISTINCT] <select_column_list> [AS <alias_name>]
FROM <table_name>
WHERE <where_condition_1> AND <where_condition_2> OR <where_condition_3>
HAVING <having_condition>
ORDER BY <order_by_list> [DESC]
"""
df = pd.read_sql_query(query_string, conn)
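
A concrete instance of the template, assuming the admissions table from the connection example above; the 'EMERGENCY' filter value is only illustrative.

query_string = """SELECT DISTINCT subject_id, hadm_id, admission_type
FROM admissions
WHERE admission_type = 'EMERGENCY'
ORDER BY subject_id DESC
"""
df = pd.read_sql_query(query_string, conn)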

Filtering with a WHERE clause

### Fetch data from a single table
### Filter with a WHERE clause, including IN membership tests and LIKE pattern matching
### conn is the PostgreSQL connection object
query_string = """SELECT [DISTINCT] <select_column_list> [AS <alias_name>]
FROM <table_name>
WHERE column_name [NOT] IN ( value_1, value_2, ...,value_n)
AND column_name BETWEEN value_1 AND value_2
OR column_name LIKE 'string'
"""
df = pd.read_sql_query(query_string, conn)

Filtering with a HAVING clause

### Fetch data from a single table
### Filter with a HAVING clause using aggregate functions (HAVING filters groups, so a GROUP BY is needed)
### conn is the PostgreSQL connection object
query_string = """SELECT [DISTINCT] <select_column_list> [AS <alias_name>]
FROM <table_name>
GROUP BY <group_by_column_list>
HAVING [aggregation_function] = value_1
AND [aggregation_function] = value_2
"""
df = pd.read_sql_query(query_string, conn)

Fetching the first N rows

### Fetch data from a single table
### Filter with WHERE and HAVING clauses
### Sort in descending order by the order_by_list columns
### Keep only the first N rows
### conn is the PostgreSQL connection object
query_string = """SELECT [DISTINCT] <select_column_list> [AS <alias_name>]
FROM <table_name>
WHERE <where_condition_1> AND <where_condition_2> OR <where_condition_3>
HAVING <having_condition>
ORDER BY <order_by_list> [DESC]
LIMIT <selected N>
"""
df = pd.read_sql_query(query_string, conn)

Exporting data from multiple tables

### Fetch data from multiple tables
### Combine two tables with a JOIN (e.g. INNER JOIN)
### Filter with WHERE and HAVING clauses
### conn is the PostgreSQL connection object
query_string = """SELECT [DISTINCT] <select_column_list>
FROM <left_table>
<join_type> JOIN <right_table>
ON <join_condition> 
WHERE <where_condition>
HAVING <having_condition>
ORDER BY <order_by_list> DESC
LIMIT <limit_number>
"""
df = pd.read_sql_query(query_string, conn)

Using aggregate functions

### Fetch data from multiple tables
### Combine two tables with a JOIN (e.g. INNER JOIN)
### Filter with WHERE and HAVING clauses
### conn is the PostgreSQL connection object
query_string = """SELECT [aggregation function] (<column_name>)
FROM <left_table>
<join_type> JOIN <right_table>
ON <join_condition>
WHERE <where_condition>
GROUP BY <group_by_all_the_left_columns>
[HAVING <having_condition>]
[ORDER BY <order_by_list>]
[LIMIT <limit_number>]
"""
df = pd.read_sql_query(query_string, conn)

Using subqueries

### If a SQL query grows long and complex, the result of a subquery can be referenced directly
### conn is the PostgreSQL connection object
query_string = """-- subquery used as a derived table
SELECT <column_list> FROM (
    SELECT <column_list> FROM <table_name>
) AS alias_table_name
"""
query_string_1 = """-- subquery used as a set of values
SELECT <column_list> FROM <table_name>
WHERE <column_name> IN (
    SELECT <column_name> FROM <table_name> WHERE <where_condition>
)
"""
df = pd.read_sql_query(query_string, conn)
df_1 = pd.read_sql_query(query_string_1, conn)

Using a WITH clause

### If a SQL query grows long and complex, subquery results can be defined as named table expressions and reused later
### conn is the PostgreSQL connection object
query_string = """WITH table_variable_1 AS (
    <SELECT query>
),
table_variable_2 AS (
    SELECT * FROM table_variable_1
)
SELECT * FROM table_variable_2;
"""
df = pd.read_sql_query(query_string, conn)
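
A concrete sketch of a CTE against the admissions table used above; the 'EMERGENCY' filter value and the alias names are only illustrative.

query_string = """WITH emergency_admissions AS (
    SELECT subject_id, hadm_id, admittime
    FROM admissions
    WHERE admission_type = 'EMERGENCY'
)
SELECT subject_id, COUNT(*) AS n_emergency
FROM emergency_admissions
GROUP BY subject_id;
"""
df = pd.read_sql_query(query_string, conn)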

General conditional expression (like if-else)

### Conditional expression
### conn is the PostgreSQL connection object
query_string = """-- general conditional expression (like if-else), used inside a SELECT
SELECT <column_name>,
    CASE
        WHEN condition1 THEN result1 -- e.g. WHEN count > 5 THEN 1
        WHEN condition2 THEN result2 -- e.g. WHEN name = 'elle' THEN 'ELLE'
        [...]
        [ELSE result_n]
    END AS <new_column_name>
FROM <table_name>
"""
df = pd.read_sql_query(query_string, conn)
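
A concrete sketch using the admissions table from above; the admission_type value and the alias is_emergency are only illustrative.

query_string = """SELECT subject_id, admission_type,
    CASE
        WHEN admission_type = 'EMERGENCY' THEN 1
        ELSE 0
    END AS is_emergency
FROM admissions
"""
df = pd.read_sql_query(query_string, conn)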

Listing all table names in the database

### List all table names in the database
### conn is the PostgreSQL connection object
query_string = """
SELECT * FROM pg_tables
WHERE schemaname <> 'pg_catalog' AND schemaname <> 'information_schema'; 
"""
df = pd.read_sql_query(query_string, conn)

Window functions

### Window function
### conn is the PostgreSQL connection object
query_string = """SELECT <column_name>,
    window_func() OVER ( [PARTITION BY xx] [ORDER BY xx] )
FROM <table_name>
"""
df = pd.read_sql_query(query_string, conn)
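
A concrete sketch with ROW_NUMBER() against the admissions table used above, numbering each patient's admissions in chronological order; the alias admission_rank is illustrative.

query_string = """SELECT subject_id, hadm_id, admittime,
    ROW_NUMBER() OVER (PARTITION BY subject_id ORDER BY admittime) AS admission_rank
FROM admissions
"""
df = pd.read_sql_query(query_string, conn)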

Inspecting the column types of a table

### Inspect the column types of a table
### conn is the PostgreSQL connection object
query_string = """
SELECT column_name, data_type
FROM information_schema.columns
WHERE table_name = '<table_name>'
"""
df = pd.read_sql_query(query_string, conn)

Basic operations

Installing packages from a mirror (Python)

!pip install package_name -i https://pypi.douban.com/simple/  ## install a package from the specified mirror; the mirror URL can be changed as needed

Quiet installation (Python)

!pip install package_name -q  ## -q suppresses most of pip's output

Descriptive statistics

Pearson correlation coefficient

import numpy as np 
matrix = np.transpose(np.array(X))
np.corrcoef(matrix[0], matrix[1])[0, 1]
## X: array-like
## https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.corrcoef.html

Moving average

import numpy as np
ret = np.cumsum(np.array(X), dtype=float)
ret[w:] = ret[w:] - ret[:-w]
result = ret[w - 1:] / w
## X: array-like
## w: window size (int)
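
A quick worked check of the cumulative-sum trick above, with an illustrative input and a window size of 3:

X = [1, 2, 3, 4, 5, 6]
w = 3
ret = np.cumsum(np.array(X), dtype=float)
ret[w:] = ret[w:] - ret[:-w]
print(ret[w - 1:] / w)  ## [2. 3. 4. 5.]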

Column quantiles

import numpy as np
import pandas as pd
## set column type
my_df['col'] = my_df['col'].astype(np.float64)
## computations for 4 quantiles : quartiles
bins_col = pd.qcut(my_df['col'], 4)
bins_col_label = pd.qcut(my_df['col'], 4, labels=False)

Multiple aggregations (grouped data)

## columns settings
grouped_on = 'col_0' ## ['col_0', 'col_2'] for multiple columns
aggregated_column = 'col_1'
#### Choice of aggregate functions
### On non-NA values in the group
### - numeric choice :: mean, median, sum, std, var, min, max, prod
### - group choice :: first, last, count
## list of functions to compute
agg_funcs = ['mean', 'max']
## compute aggregate values
aggregated_values = my_df.groupby(grouped_on)[aggregated_column].agg(agg_funcs)
## get the aggregates of a given group (group is a value of the grouping column)
aggregated_values.loc[group]

Sorting by column value

import pandas as pd
## my_df is a pandas DataFrame; counts of each value, sorted by the value itself
my_df['col_0'].value_counts().sort_index()

User-defined function (grouped data)

## columns settings
grouped_on = ['col_0']
aggregated_columns = ['col_1']
def my_func(my_group_array):
    return my_group_array.min() * my_group_array.count()
### list of functions to compute
agg_funcs = [my_func] ## could be many
## compute aggregate values
aggregated_values = my_df.groupby(grouped_on)[aggregated_columns].agg(agg_funcs)

Standard deviation

import numpy as np
np.std(np.array(X))
## X: array-like
## https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.std.html

Using a user-defined function on a grouped DataFrame

## top n in aggregate dataframe
def top_n(group_df, col, n=2):
    bests = group_df[col].value_counts()[:n]
    return bests
## columns settings
grouped_on = 'col_0'
aggregated_column = 'col'
grouped = my_df.groupby(grouped_on)
groups_top_n = grouped.apply(top_n, aggregated_column, n=3)

All columns (numeric dtype)

my_df.describe()

Sorting by frequency in descending order

import pandas as pd
## my_df is a pandas DataFrame
my_df['col_0'].value_counts()

A single statistic for selected columns (e.g. max, numeric dtype)

my_df["col"].max() ## [["col_0", "col_1"]] 多字段

Maximal information coefficient (MIC)

import numpy as np
from minepy import MINE
## X: array-like; compute the MIC between the first column and each other column
matrix = np.transpose(np.array(X)).astype(float)
mine = MINE(alpha=0.6, c=15, est="mic_approx")
mic_result = []
for i in matrix[1:]:
    mine.compute_score(matrix[0], i)
    mic_result.append(mine.mic())

Mean

import numpy as np
np.average(np.array(X))
## X: array-like
## https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.average.html

Basic statistics of grouped data

## columns settings
grouped_on = 'col_0' ## ['col_0', 'col_1'] for multiple columns
aggregated_column = 'col_1'
#### Choice of aggregate functions
### On non-NA values in the group
### - numeric choice : mean, median, sum, std, var, min, max, prod
### - group choice : first, last, count
### On the group lines
### - size of the group : size
aggregated_values = my_df.groupby(grouped_on)[aggregated_column].mean()
aggregated_values.name = 'mean'
## get the aggregate of a given group (group is a value of the grouping column)
aggregated_values.loc[group]

Number of rows and columns

## get the shape of the current DataFrame
nb_rows = my_df.shape[0]
nb_cols = my_df.shape[1]

Correlation

my_df.corr()

Median

import numpy as np
np.median(np.array(X))
## X: array-like
## https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.median.html

Iterating over groups

## columns settings
grouped_on = 'col_0' ## ['col_0', 'col_1'] for multiple columns
grouped = my_df.groupby(grouped_on)
i = 0
for group_name, group_dataframe in grouped:
    if i > 10:
        break
    i += 1
    print(i, group_name, group_dataframe.mean())  ### mean on all numerical columns

Covariance

my_df.cov()

Variance

import numpy as np
np.var(np.array(X))
## X: array-like
## https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.var.html

K-means clustering

import numpy as np
from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=k).fit(np.array(X))
label = k_means.labels_.tolist()
centers = k_means.cluster_centers_.tolist()
inertia = k_means.inertia_
## k: int, k >= 2
## X: array-like
## http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

A single statistic for all columns (e.g. max, numeric dtype)

my_df.max()

Frequency counts of a categorical column

my_df['category'].value_counts()

Checking for missing values

for col in my_df.columns:
    print("column {} contains {} missing values".format(col, my_df[col].isnull().sum()))

Checking how many unique values each column contains

for col in my_df.columns:
    print("column {} has {} unique values".format(col, my_df[col].nunique()))

Creating new columns

Setting values with a complex function

import numpy as np
def complex_formula(col0_value, col1_value):
    return "%s (%s)" % (col0_value, col1_value)
my_df['new_col'] = np.vectorize(complex_formula)(my_df['col_0'], my_df['col_1'])

Standard aggregation (with groupby)

## columns settings
grouped_on = 'col_1'
aggregated_column = 'col_0'
#### Choice of aggregate functions
### On non-NA values in the group
### - numeric choice : mean, median, sum, std, var, min, max, prod
### - group choice : first, last, count
my_df['aggregate_values_on_col'] = my_df.groupby(grouped_on)[aggregated_column].transform(lambda v: v.mean())

Setting values with a custom function

import numpy as np
from math import log
def to_log(v):
    try:
        return log(v)
    except (ValueError, TypeError):
        return np.nan
my_df['new_col'] = my_df['col_0'].map(to_log)

User-defined aggregation function (with groupby)

def zscore(x):
    return (x - x.mean()) / x.std()
 
my_df['zscore_col'] = my_df.groupby(grouped_on)[aggregated_column].transform(zscore)

Handling missing values

Filling missing values in multiple columns with a single value

my_df[['col_0', 'col_1']] = my_df[['col_0', 'col_1']].fillna(value)

Filling missing values in a single column with a single value

my_df['col'] = my_df['col'].fillna(value)

Dropping rows with missing values in specified columns

cols = ['col_0', 'col_1']
records_without_nas_in_cols = my_df.dropna(subset=cols)

Filling missing values with the previous or next valid value

## - ffill : propagate last valid observation forward to next valid
## - backfill : use NEXT valid observation to fill gap
my_df['col'] = my_df['col'].fillna(method='ffill')

Filling missing values with an aggregated value

grouped_on = 'col_0' ## ['col_0', 'col_1'] for multiple columns
#### Choice of aggregate functions
### On non-NA values in the group
### - numeric choice : mean, median, sum, std, var, min, max, prod
### - group choice : first, last, count
def filling_function(v):
    return v.fillna(v.mean())
 
my_df['col'] = my_df.groupby(grouped_on)['col'].transform(filling_function)

Inspecting the dataset column by column

my_df.info()

Dropping all rows that contain any missing value

records_without_nas = my_df.dropna()

Basic pandas operations

Pivot table of frequencies

import pandas as pd
## count co-occurrences of make vs. fuel_type / aspiration
## (pd.crosstab builds the frequency pivot table; the old rows=/cols= pivot_table keywords no longer exist)
freqs = pd.crosstab(
    index=my_df["make"],
    columns=[my_df["fuel_type"], my_df["aspiration"]],
    margins=True  ## add subtotals on rows and cols
)

Concatenating data horizontally

import pandas as pd
two_dfs_hconcat = pd.concat([my_df, my_df2], axis=1)

Resetting the DataFrame index to the default values

my_df_no_index = my_df.reset_index()
## my_df_no_index.index is now [0, 1, 2, ...]

Writing data row by row as tuples

import csv
## write rows one by one with the standard csv module
## each t in data_to_write is of the form: (value0, value1, ...)
with open("data.csv", "w", newline="") as f:
    writer = csv.writer(f)
    for t in data_to_write:
        writer.writerow(t)

Data restructuring