Model serving
Deploying a Keras model as a service
import tensorflow as tf

## freeze training session
def freeze_session(session, keep_var_names=None, output_names=None, clear_devices=True):
    ## get session graph
    graph = session.graph
    with graph.as_default():
        ## remove training-related nodes
        freeze_var_names = list(set(v.op.name for v in tf.global_variables()).difference(keep_var_names or []))
        output_names = output_names or []
        output_names += [v.op.name for v in tf.global_variables()]
        input_graph_def = graph.as_graph_def()
        ## remove device info if trained on GPU
        if clear_devices:
            for node in input_graph_def.node:
                node.device = ""
        frozen_graph = tf.graph_util.convert_variables_to_constants(
            session, input_graph_def, output_names, freeze_var_names)
        return frozen_graph

## `model` is an already-trained Keras model
from keras import backend as K
frozen_graph = freeze_session(K.get_session(),
                              output_names=[out.op.name for out in model.outputs])
tf.train.write_graph(frozen_graph, "./", "model.pb", as_text=False)
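After exporting, the frozen graph can be loaded back for inference. A minimal sketch, assuming TensorFlow 1.x; the tensor names "input:0" and "output:0" are placeholders to be replaced with your model's actual tensor names:
import tensorflow as tf

with tf.gfile.GFile("model.pb", "rb") as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

graph = tf.Graph()
with graph.as_default():
    tf.import_graph_def(graph_def, name="")

with tf.Session(graph=graph) as sess:
    ## "input:0" / "output:0" are placeholder tensor names
    input_tensor = graph.get_tensor_by_name("input:0")
    output_tensor = graph.get_tensor_by_name("output:0")
    predictions = sess.run(output_tensor, feed_dict={input_tensor: batch})  ## `batch` is your input data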
Python script deployable as a service
__name__ = 'model'
def func(input_object):
    ## your code here
    return output_object
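A hypothetical filled-in version of the template: the module loads a serialized model once at import time, and func scores each incoming record (model.pkl and the feature format are assumptions):
__name__ = 'model'

import pickle

## model.pkl is an assumed artifact from your training step
with open("model.pkl", "rb") as f:
    _model = pickle.load(f)

def func(input_object):
    ## input_object: one record of feature values
    return _model.predict([input_object]).tolist()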
SQL operations
Connecting to PostgreSQL
import psycopg2
import pandas as pd

connection = psycopg2.connect(user="username",
                              password="password",
                              host="127.0.0.1",
                              port="5432",
                              database="db_name")
cursor = connection.cursor()
test_query = """SELECT subject_id, hadm_id, admittime, dischtime, admission_type, diagnosis
FROM admissions
"""
test = pd.read_sql_query(test_query, connection)
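When filter values come from user input, a parameterized query (psycopg2's %(name)s binding, passed through pandas' params argument) is safer than string formatting. A sketch using the admissions table above; the 'EMERGENCY' value is illustrative:
param_query = """SELECT subject_id, admittime
FROM admissions
WHERE admission_type = %(adm_type)s
"""
df = pd.read_sql_query(param_query, connection, params={"adm_type": "EMERGENCY"})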
Exporting data from a single table
### Fetch data from a single table
### Filter with WHERE and HAVING clauses
### Sort in descending order by the order_by_list columns
### conn is the PostgreSQL database connection object
query_string = """SELECT [DISTINCT] <select_column_list> [AS <alias_name>]
FROM <table_name>
WHERE <where_condition_1> AND <where_condition_2> OR <where_condition_3>
HAVING <having_condition>
ORDER BY <order_by_list> [DESC]
"""
df = pd.read_sql_query(query_string, conn)
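A concrete instance of the template, using the admissions table from the connection example (the 'EMERGENCY' filter value is illustrative):
query_string = """SELECT DISTINCT subject_id, admission_type
FROM admissions
WHERE admission_type = 'EMERGENCY'
ORDER BY subject_id DESC
"""
df = pd.read_sql_query(query_string, conn)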
Filtering with a WHERE clause
### Fetch data from a single table
### Filter with a WHERE clause, using IN for membership tests and LIKE for pattern matching
### conn is the PostgreSQL database connection object
query_string = """SELECT [DISTINCT] <select_column_list> [AS <alias_name>]
FROM <table_name>
WHERE column_name [NOT] IN ( value_1, value_2, ...,value_n)
AND column_name BETWEEN value_1 AND value_2
OR column_name LIKE '<pattern>'
"""
df = pd.read_sql_query(query_string, conn)
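A hypothetical concrete instance against the admissions table (filter values are illustrative):
query_string = """SELECT subject_id, diagnosis
FROM admissions
WHERE admission_type IN ('EMERGENCY', 'URGENT')
AND diagnosis LIKE '%SEPSIS%'
"""
df = pd.read_sql_query(query_string, conn)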
Filtering with a HAVING clause
### Fetch data from a single table
### Filter with a HAVING clause on aggregate functions
### conn is the PostgreSQL database connection object
query_string = """SELECT [DISTINCT] <select_column_list> [AS <alias_name>]
FROM <table_name>
HAVING [aggregation function] = value_1
AND [aggregation_function] = value_2
"""
df = pd.read_sql_query(query_string, conn)
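For example, subjects with more than one admission (admissions table from the connection example):
query_string = """SELECT subject_id, COUNT(*) AS n_admissions
FROM admissions
GROUP BY subject_id
HAVING COUNT(*) > 1
"""
df = pd.read_sql_query(query_string, conn)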
Fetching the top N rows
### Fetch data from a single table
### Filter with WHERE and HAVING clauses
### Sort in descending order by the order_by_list columns
### Return only the first N rows
### conn is the PostgreSQL database connection object
query_string = """SELECT [DISTINCT] <select_column_list> [AS <alias_name>]
FROM <table_name>
WHERE <where_condition_1> AND <where_condition_2> OR <where_condition_3>
HAVING <having_condition>
ORDER BY <order_by_list> [DESC]
LIMIT <selected N>
"""
df = pd.read_sql_query(query_string, conn)
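For example, the 10 most recent admissions:
query_string = """SELECT subject_id, admittime
FROM admissions
ORDER BY admittime DESC
LIMIT 10
"""
df = pd.read_sql_query(query_string, conn)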
Exporting data from multiple tables
### Fetch data from multiple tables
### Use INNER JOIN to combine data from two tables
### Filter with WHERE and HAVING clauses
### conn is the PostgreSQL database connection object
query_string = """SELECT [DISTINCT] <select_column_list>
FROM <left_table>
<join_type> JOIN <right_table>
ON <join_condition>
WHERE <where_condition>
HAVING <having_condition>
ORDER BY <order_by_list> DESC
LIMIT <limit_number>
"""
df = pd.read_sql_query(query_string, conn)
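A hypothetical two-table join, assuming a patients table that shares subject_id with admissions:
query_string = """SELECT a.subject_id, a.admittime, p.gender
FROM admissions a
INNER JOIN patients p
ON a.subject_id = p.subject_id
"""
df = pd.read_sql_query(query_string, conn)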
Using aggregate functions
### Fetch data from multiple tables
### Use INNER JOIN to combine data from two tables
### Filter with WHERE and HAVING clauses
### conn is the PostgreSQL database connection object
query_string = """SELECT [aggregation function] (<column_name>)
FROM <left_table>
<join_type> JOIN <right_table>
ON <join_condition>
WHERE <where_condition>
GROUP BY <group_by_all_the_left_columns>
[HAVING <having_condition>]
[ORDER BY <order_by_list>]
[LIMIT <limit_number>]
"""
df = pd.read_sql_query(query_string, conn)
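A concrete single-table instance of the aggregate pattern (the join is optional): count admissions per admission_type, using the admissions table from the connection example:
query_string = """SELECT admission_type, COUNT(*) AS n_admissions
FROM admissions
GROUP BY admission_type
ORDER BY n_admissions DESC
"""
df = pd.read_sql_query(query_string, conn)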
Using subqueries
### If a SQL query grows too long or complex, the result of a subquery can be reused
### conn is the PostgreSQL database connection object
query_string = """-- subquery used as a table
SELECT <column_list> FROM (
SELECT <column_list> FROM table_name
) AS alias_table_name
"""
query_string_1 = """-- subquery used as a set of values
SELECT <column_list> FROM <table_name>
WHERE <column_name> IN (
SELECT <column_name> FROM <table_name> WHERE <where_condition>
)
"""
df = pd.read_sql_query(query_string, conn)
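A concrete IN-subquery example against the admissions table (the 'EMERGENCY' value is illustrative):
query_string = """SELECT subject_id, admittime, admission_type
FROM admissions
WHERE subject_id IN (
SELECT subject_id FROM admissions WHERE admission_type = 'EMERGENCY'
)
"""
df = pd.read_sql_query(query_string, conn)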
Using a WITH clause
### If a SQL query grows too long or complex, subquery results can be defined as named table expressions and reused later
### conn is the PostgreSQL database connection object
query_string = """WITH table_variable_1 AS (
<SELECT query>
),
table_variable_2 AS (
SELECT * FROM table_variable_1
)
SELECT * FROM table_variable_2;
"""
df = pd.read_sql_query(query_string, conn)
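A concrete sketch: a CTE that counts admissions per subject, then filters on the count:
query_string = """WITH admission_counts AS (
SELECT subject_id, COUNT(*) AS n_admissions
FROM admissions
GROUP BY subject_id
)
SELECT * FROM admission_counts
WHERE n_admissions > 1
"""
df = pd.read_sql_query(query_string, conn)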
Conditional expressions (if-else style)
### Conditional expression
### conn is the PostgreSQL database connection object
### note: a CASE expression is not a standalone query; it must appear inside one, e.g. in a SELECT list
query_string = """-- conditional expression (if-else style)
CASE
WHEN condition1 THEN result1 -- e.g. WHEN count > 5 THEN 1
WHEN condition2 THEN result2 -- e.g. WHEN name = 'elle' THEN 'ELLE'
[...]
[ELSE result_n]
END
"""
df = pd.read_sql_query(query_string, conn)
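A sketch of CASE embedded in a SELECT list (admissions table assumed; the flag logic is illustrative):
query_string = """SELECT subject_id,
CASE
WHEN admission_type = 'EMERGENCY' THEN 1
ELSE 0
END AS is_emergency
FROM admissions
"""
df = pd.read_sql_query(query_string, conn)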
Listing all tables in the database
### List all table names in the database
### conn is the PostgreSQL database connection object
query_string = """
SELECT * FROM pg_tables
WHERE schemaname <> 'pg_catalog' AND schemaname <> 'information_schema';
"""
df = pd.read_sql_query(query_string, conn)
Window functions
### Window function
### conn is the PostgreSQL database connection object
query_string = """SELECT <column_name>,
window_func() OVER ( [PARTITION BY xx] [ORDER BY xx] )
FROM <table_name>
"""
df = pd.read_sql_query(query_string, conn)
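A hypothetical concrete example: number each subject's admissions by admission time with ROW_NUMBER():
query_string = """SELECT subject_id, admittime,
ROW_NUMBER() OVER (PARTITION BY subject_id ORDER BY admittime) AS admission_rank
FROM admissions
"""
df = pd.read_sql_query(query_string, conn)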
Inspecting column types of a table
### Inspect the data types of a table's columns
### conn is the PostgreSQL database connection object
query_string = """
SELECT column_name, data_type
FROM information_schema.columns
WHERE table_name = '<table_name>'
"""
df = pd.read_sql_query(query_string, conn)
Basic operations
Installing packages from a mirror - Python
!pip install package_name -i https://pypi.douban.com/simple/ # install a package from the given mirror; the mirror URL can be changed
Quiet install - Python
!pip install package_name -q
Descriptive statistics
Pearson correlation coefficient
import numpy as np
matrix = np.transpose(np.array(X))
np.corrcoef(matrix[0], matrix[1])[0, 1]
## X: array-like
## https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.corrcoef.html
Moving average
import numpy as np
ret = np.cumsum(np.array(X), dtype=float)
ret[w:] = ret[w:] - ret[:-w]
result = ret[w - 1:] / w
## X: array-like
## w: window size (int)
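A quick self-contained check of the snippet above with hypothetical inputs:
import numpy as np
X = [1, 2, 3, 4, 5]
w = 3
ret = np.cumsum(np.array(X), dtype=float)
ret[w:] = ret[w:] - ret[:-w]
result = ret[w - 1:] / w  ## -> array([2., 3., 4.])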
Column quantiles
import numpy as np
import pandas as pd
## set column type
my_df['col'] = my_df['col'].astype(np.float64)
## computations for 4 quantiles: quartiles
bins_col = pd.qcut(my_df['col'], 4)
bins_col_label = pd.qcut(my_df['col'], 4, labels=False)
Multiple aggregations (grouped data)
## columns settings
grouped_on = 'col_0' ## ['col_0', 'col_2'] for multiple columns
aggregated_column = 'col_1'
#### Choice of aggregate functions
### On non-NA values in the group
### - numeric choice : mean, median, sum, std, var, min, max, prod
### - group choice : first, last, count
## list of functions to compute
agg_funcs = ['mean', 'max']
## compute aggregate values
aggregated_values = my_df.groupby(grouped_on)[aggregated_column].agg(agg_funcs)
## get the aggregates of one group (replace `group` with a group key)
aggregated_values.loc[group]
Sorting value counts by column value
import pandas as pd
## my_df is a pandas DataFrame
my_df['col_0'].value_counts().sort_index()
User-defined function (grouped data)
## columns settings
grouped_on = ['col_0']
aggregated_columns = ['col_1']
def my_func(my_group_array):
    return my_group_array.min() * my_group_array.count()
### list of functions to compute
agg_funcs = [my_func] ## could be many
## compute aggregate values
aggregated_values = my_df.groupby(grouped_on)[aggregated_columns].agg(agg_funcs)
Standard deviation
import numpy as np
np.std(np.array(X))
## X: array-like
## https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.std.html
Applying a user-defined function to grouped dataframes
## top n in aggregate dataframe
def top_n(group_df, col, n=2):
    bests = group_df[col].value_counts()[:n]
    return bests
## columns settings
grouped_on = 'col_0'
aggregated_column = 'col'
grouped = my_df.groupby(grouped_on)
groups_top_n = grouped.apply(top_n, aggregated_column, n=3)
All columns (numeric dtypes)
my_df.describe()
Sorting by frequency in descending order
import pandas as pd
## my_df is a pandas DataFrame
my_df['col_0'].value_counts()
A single statistic of selected columns (e.g. max, for numeric columns)
my_df["col"].max() ## use [["col_0", "col_1"]] for multiple columns
Maximal information coefficient (MIC)
import numpy as np
from minepy import MINE
matrix = np.transpose(np.array(X)).astype(float)
mine = MINE(alpha=0.6, c=15, est="mic_approx")
mic_result = []
## MIC of the first column against each of the remaining columns
for i in matrix[1:]:
    mine.compute_score(matrix[0], i)
    mic_result.append(mine.mic())
## X: array-like; mic_result holds one score per remaining column
Mean
import numpy as np
np.average(np.array(X))
## X: array-like
## https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.average.html
Basic information about grouped data
## columns settings
grouped_on = 'col_0' ## ['col_0', 'col_1'] for multiple columns
aggregated_column = 'col_1'
#### Choice of aggregate functions
### On non-NA values in the group
### - numeric choice : mean, median, sum, std, var, min, max, prod
### - group choice : first, last, count
### On the group lines
### - size of the group : size
aggregated_values = my_df.groupby(grouped_on)[aggregated_column].mean()
aggregated_values.name = 'mean'
## get the aggregate of one group (replace `group` with a group key)
aggregated_values.loc[group]
Number of rows and columns
## get the shape of the dataframe
nb_rows = my_df.shape[0]
nb_cols = my_df.shape[1]
Correlation
my_df.corr()
Median
import numpy as np
np.median(np.array(X))
## X: array-like
## https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.median.html
Iterating over groups
## columns settings
grouped_on = 'col_0' ## ['col_0', 'col_1'] for multiple columns
grouped = my_df.groupby(grouped_on)
i = 0
for group_name, group_dataframe in grouped:
    if i > 10:
        break
    i += 1
    print(i, group_name, group_dataframe.mean()) ### mean on all numerical columns
Covariance
my_df.cov()
Variance
import numpy as np
np.var(np.array(X))
## X: array-like
## https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.var.html
K-means clustering
import numpy as np
from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=k).fit(np.array(X))
labels = k_means.labels_.tolist()
centers = k_means.cluster_centers_.tolist()
inertia = k_means.inertia_
## k: int, k>=2
## X: array-like
## http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
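A self-contained toy run with hypothetical 2-D data:
import numpy as np
from sklearn.cluster import KMeans
X = np.array([[1, 2], [1, 4], [10, 2], [10, 4]])
k_means = KMeans(n_clusters=2, n_init=10).fit(X)
print(k_means.labels_)           ## e.g. [1 1 0 0]
print(k_means.cluster_centers_)  ## two 2-D centroids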
A single statistic of all columns (e.g. max, for numeric columns)
my_df.max()
Frequency counts of a categorical column
my_df['category'].value_counts()
Checking for missing values
for col in my_df.columns:
    print("column {} contains {} missing values".format(col, my_df[col].isnull().sum()))
Checking how many unique values each column contains
for col in my_df.columns:
    print("column {} contains {} unique values".format(col, my_df[col].nunique()))
Creating new columns
Setting values with a complex formula
import numpy as np
def complex_formula(col0_value, col1_value):
    return "%s (%s)" % (col0_value, col1_value)
my_df['new_col'] = np.vectorize(complex_formula)(my_df['col_0'], my_df['col_1'])
Standard aggregation (with groupby)
## columns settings
grouped_on = 'col_1'
aggregated_column = 'col_0'
#### Choice of aggregate functions
### On non-NA values in the group
### - numeric choice : mean, median, sum, std, var, min, max, prod
### - group choice : first, last, count
my_df['aggregate_values_on_col'] = my_df.groupby(grouped_on)[aggregated_column].transform(lambda v: v.mean())
Setting values with a custom function
import numpy as np
from math import log
def to_log(v):
    try:
        return log(v)
    except (ValueError, TypeError):
        return np.nan
my_df['new_col'] = my_df['col_0'].map(to_log)
User-defined aggregation function (with groupby)
def zscore(x):
    return (x - x.mean()) / x.std()
my_df['zscore_col'] = my_df.groupby(grouped_on)[aggregated_column].transform(zscore)
Handling missing values
Filling missing values in several columns with one value
my_df[['col_0', 'col_1']] = my_df[['col_0', 'col_1']].fillna(value)
Filling missing values in one column with one value
my_df['col'] = my_df['col'].fillna(value)
Dropping rows with missing values in given columns
cols = ['col_0', 'col_1']
records_without_nas_in_cols = my_df.dropna(subset=cols)
Filling missing values with the last valid or the next valid observation
## - ffill : propagate last valid observation forward to next valid
## - backfill : use NEXT valid observation to fill gap
my_df['col'] = my_df['col'].fillna(method='ffill')
Filling missing values with an aggregated value
grouped_on = 'col_0' ## ['col_0', 'col_1'] for multiple columns
#### Choice of aggregate functions
### On non-NA values in the group
### - numeric choice : mean, median, sum, std, var, min, max, prod
### - group choice : first, last, count
def filling_function(v):
    return v.fillna(v.mean())
my_df['col'] = my_df.groupby(grouped_on)['col'].transform(filling_function)
Inspecting the dataset column by column
my_df.info()
Dropping all rows that contain any missing value
records_without_nas = my_df.dropna()
Basic Pandas operations
Pivot table of frequencies
freqs = my_df.pivot_table(
    index=["make"],
    columns=["fuel_type", "aspiration"],
    aggfunc="size", ## count rows to get frequencies
    margins=True ## add subtotals on rows and cols
)
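An equivalent sketch with pd.crosstab, which is purpose-built for frequency tables:
import pandas as pd
freqs = pd.crosstab(index=my_df["make"],
                    columns=[my_df["fuel_type"], my_df["aspiration"]],
                    margins=True) ## add subtotals on rows and cols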
Concatenating data horizontally
import pandas as pd
two_dfs_hconcat = pd.concat([my_df, my_df2], axis=1)
Resetting a dataframe's index to the default
my_df_no_index = my_df.reset_index()
## my_df_no_index.index is now [0, 1, 2, ...]
Writing data row by row as tuples
## get_writer / write_tuple belong to the Dataiku dataset API, not to pandas,
## so py_recipe_output must be a Dataiku dataset rather than a DataFrame;
## "output_dataset_name" is a placeholder
import dataiku
py_recipe_output = dataiku.Dataset("output_dataset_name")
writer = py_recipe_output.get_writer()
## t is of the form:
## (value0, value1, ...)
for t in data_to_write:
    writer.write_tuple(t)
writer.close()
Data restructuring