import numpy as np
import pandas as pd


# 3x4 frame of the integers 0..11 with labelled rows and columns.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data


def transform(x):
    """Return the first three characters of *x*, upper-cased."""
    return x[:3].upper()


# Index.map builds a new Index from the mapped labels; the frame's own
# index is not modified, which the second and third displays confirm.
#display(data.columns.map(transform))
display(data.index.map(transform))
display(data.index)
display(data)

Index(['OHI', 'COL', 'NEW'], dtype='object')

Index(['Ohio', 'Colorado', 'New York'], dtype='object')


# Assigning to .index relabels the rows of the existing frame.
data.index = data.index.map(transform)
data


# rename() accepts functions applied to every label; the result is echoed,
# not assigned back to `data`.
data.rename(index=str.title, columns=str.upper)


# rename() also accepts dicts to relabel a subset.  The row labels were cut
# to three letters above ('OHI', 'COL', 'NEW'), so the key has to be 'OHI':
# a key that matches no label ('OHIO') is ignored without any error.
data.rename(index={'OHI': 'INDIANA'}, columns={'three': 'peekaboo'})


ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]


# Intended age groups: 18-25, 26-35, 36-60, 60 and over.

bins = [18, 26, 36, 60, 100]  # bin edges
# right=True is the default: every interval is open on the left and
# closed on the right, i.e. (lo, hi].
cats = pd.cut(ages, bins=bins, right=True)
cats

[(18, 26], (18, 26], (18, 26], (26, 36], (18, 26], ..., (26, 36], (60, 100], (36, 60], (36, 60], (26, 36]]
Length: 12
Categories (4, interval[int64]): [(18, 26] < (26, 36] < (36, 60] < (60, 100]]


# Each bin is identified by an integer code.
# .codes gives, for every input value, the code of the bin it fell into.
cats.codes # 0 to 3 for the four bins

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)


# The bins themselves, as an IntervalIndex closed on the right.
cats.categories

IntervalIndex([(18, 26], (26, 36], (36, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')


# Number of ages per bin.  The top-level pd.value_counts() helper is
# deprecated, so the categorical is wrapped in a Series and counted there.
pd.Series(cats).value_counts()

(18, 26]     5
(36, 60]     3
(26, 36]     3
(60, 100]    1
dtype: int64


# right=False closes the left edge instead: intervals become [lo, hi).
cats = pd.cut(ages, bins=[18, 26, 36, 61, 100], right=False)
cats

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]


# One label per bin, in bin order; the labels replace the interval notation.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins=bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']


data = np.random.rand(20)
print(data)
# An integer instead of explicit edges requests that many bins; here four,
# each given a name via `labels`.
pd.cut(
    data,
    bins=4,
    labels=['lowest', 'low', 'high', 'highest'],
    precision=2,
)

[0.56843069 0.30254945 0.49730879 0.68326291 0.91669867 0.10892895
 0.49549179 0.23283593 0.43686066 0.75154299 0.48089213 0.79772841
 0.28270293 0.43341824 0.00975735 0.34079598 0.68927201 0.86936929
 0.26780382 0.45674792]

['high', 'low', 'high', 'high', 'highest', ..., 'low', 'high', 'highest', 'low', 'low']
Length: 20
Categories (4, object): ['lowest' < 'low' < 'high' < 'highest']


# 1000 rows x 4 columns of standard-normal samples.
data = pd.DataFrame(np.random.randn(1000, 4))
data.head()


data.describe()


# Values of column 2 whose magnitude exceeds 3.
col = data.loc[:, 2]
col[col.abs() > 3]

197    3.039147
416   -3.146325
Name: 2, dtype: float64


# ★
# Rows in which at least one column has magnitude above 3.  The axis is
# passed by keyword because pandas 2.0 made it keyword-only; the positional
# form `.any(1)` raises a TypeError there.
data[(np.abs(data) > 3).any(axis=1)]
#data[ (np.abs(data)>3) ].dropna(thresh=1).fillna(0)#.max()#.min()#.count()#.mean()#.std()
#data[ (np.abs(data)>3) ].dropna(thresh=1).isnull()


import numpy as np
import pandas as pd
from pandas import DataFrame

# Fixed seed so the random columns are reproducible.
np.random.seed(123)
df = DataFrame({
    'col_1': np.random.randint(20, size=20),
    'col_2': np.random.randn(20),
})
df

# Cut col_1 into four bins; the result is used as the grouping key below.
factor_col_1 = pd.cut(df['col_1'], bins=4)
factor_col_1

# Group col_1 by the bin each value fell into.  observed=False is spelled
# out so every bin keeps a row and the result does not depend on the
# changing pandas default (which also triggers a FutureWarning when omitted).
grouped_col_1 = df.col_1.groupby(factor_col_1, observed=False)
display(grouped_col_1) # SeriesGroupBy
# One row per bin, one column per statistic.
grouped_col_1.agg(['count', 'mean', 'std', 'min', 'max'])

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0C47CF10>


def summary_func(group):
    """Return count, mean, std, min and max of *group* as a dict.

    *group* is any object exposing those five zero-argument methods
    (for example a pandas Series).  Keys appear in the order listed.
    """
    stat_names = ('count', 'mean', 'std', 'min', 'max')
    return {stat: getattr(group, stat)() for stat in stat_names}

# Run summary_func on every bin; the echoed result is a Series indexed by
# (bin, statistic name).
grouped_col_1.apply(summary_func)

col_1                
(-0.019, 4.75]  count     8.000000
                mean      1.125000
                std       1.457738
                min       0.000000
                max       4.000000
(4.75, 9.5]     count     2.000000
                mean      7.500000
                std       2.121320
                min       6.000000
                max       9.000000
(9.5, 14.25]    count     4.000000
                mean     12.750000
                std       1.892969
                min      10.000000
                max      14.000000
(14.25, 19.0]   count     6.000000
                mean     17.000000
                std       1.788854
                min      15.000000
                max      19.000000
Name: col_1, dtype: float64


# unstack() moves the inner index level (the statistic names) into columns,
# giving one row per bin.
test = grouped_col_1.apply(summary_func).unstack()
# Result is discarded: only the final expression of the cell is echoed.
test.unstack().shape
test


# Cap every value whose magnitude exceeds 3 at +3 or -3, keeping its sign.
data[data.abs() > 3] = np.sign(data) * 3
data.describe()


# np.sign maps each value to 1.0 or -1.0 according to its sign.
np.sign(data).head()


df = pd.DataFrame(np.arange(20).reshape(5, 4))
# permutation() hands back a shuffled copy and leaves df alone
# (np.random.shuffle() is the in-place counterpart).
display(np.random.permutation(df))
display(df)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])


# Random ordering of the integers 0..4, used below as row positions.
sampler = np.random.permutation(5)#(4)
sampler

array([2, 4, 3, 0, 1])


# Select rows by position in the order given by sampler.
df.take(sampler)#, axis=1)


# Three random rows; random_state fixes the selection.
display(df.sample(n=3, random_state=1))
# axis=1 samples columns instead of rows.
display(df.sample(n=3, axis=1))
# Three random rows with no fixed seed.
df.sample(n=3)


choices = pd.Series([5, 7, -1, 6, 4])
# replace=True lets the same element be drawn repeatedly, so ten draws can
# be taken from five values.
draws = choices.sample(10, replace=True)
draws

1    7
2   -1
2   -1
2   -1
1    7
3    6
2   -1
3    6
1    7
1    7
dtype: int64


df = pd.DataFrame({'key': list('bbacab'), 'data1': range(6)})
df


# One indicator column per distinct value of 'key'.
pd.get_dummies(df.key)


# Same encoding, with every column name prefixed by 'key_'.
dummies = pd.get_dummies(df.key, prefix='key')
dummies


# Selecting with a list keeps a DataFrame, which has join();
# df['data1'] would be a Series and fails:
#df_with_dummy = df['data1'].join(dummies) # error:'Series' object has no attribute 'join'
df_with_dummy = df.loc[:, ['data1']].join(dummies)
df_with_dummy


# ★★
# A list of column names selects a one-column DataFrame ...
display(df[['data1']])
# ... whereas a single name selects the column as a Series.
display(df['data1'])

0    0
1    1
2    2
3    3
4    4
5    5
Name: data1, dtype: int64


# Joining in the other direction: the DataFrame of dummies joins the Series.
test = dummies.join(df['data1'])
test


mnames = ['movie_id', 'title', 'genres']
# header=None: the file has no header row, so its first line is read as
# data and the column names come from `mnames`.
# engine='python': a separator longer than one character is not supported
# by the default C parser, so the Python engine is selected explicitly
# instead of relying on the warning-and-fallback path.
movies = pd.read_table(
    'files/movies.dat',
    sep='::',
    header=None,
    names=mnames,
    engine='python',
)
movies[:10]


movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  3883 non-null   int64 
 1   title     3883 non-null   object
 2   genres    3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 60.7+ KB


# Flatten the '|'-separated genre strings of every movie into one list.
all_genres = [genre for entry in movies.genres for genre in entry.split('|')]
# Distinct genres in order of first appearance.  The list is wrapped in a
# Series because passing a plain list to pd.unique() is deprecated.
genres = pd.unique(pd.Series(all_genres))
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)


# With the distinct genres known, prepare an all-zero indicator table:
# one row per movie, one column per genre.
dummies = pd.DataFrame(
    np.zeros((len(movies), len(genres))),
    columns=genres,
)
dummies


# Worked example on the first movie: split its genre string, then look up
# the column position of each genre in `dummies`.
gen = movies.genres[0]
gen.split('|') # ['Animation', "Children's", 'Comedy']
dummies.columns.get_indexer(gen.split('|')) # ['Animation'=0, "Children's"=1, 'Comedy'=2]

array([0, 1, 2], dtype=int32)


# For every movie (row i), set the columns of its genres to 1.
for i, gen in enumerate(movies.genres):
    # Column positions of this movie's genres.
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1
dummies


# Attach the indicator columns to the movie table, each column name
# prefixed with 'Genre_' (add_suffix would append instead).
movies_windic = movies.join(dummies.add_prefix('Genre_')) # add_suffix
#movies_windic.iloc[0]
movies_windic


# Caution: set_option changes a global display setting, so it affects not
# only this DataFrame but every DataFrame printed afterwards.
pd.set_option('display.max_columns',6)
#pd.set_option('display.max_rows',2)
pd.set_option('display.max_rows',2)
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.head()


# Seed the generator so the ten uniform samples are reproducible.
np.random.seed(12345)
values = np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])


# Lift the column/row display limits set earlier (None = no limit).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Five bins of width 0.2 covering 0 to 1.
bins = [0,0.2,0.4,0.6,0.8,1]

display(pd.cut(values, bins))
# One indicator column per bin.
display(pd.get_dummies(pd.cut(values, bins)))

[(0.8, 1.0], (0.2, 0.4], (0.0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.4, 0.6], (0.8, 1.0], (0.6, 0.8], (0.6, 0.8], (0.6, 0.8]]
Categories (5, interval[float64]): [(0.0, 0.2] < (0.2, 0.4] < (0.4, 0.6] < (0.6, 0.8] < (0.8, 1.0]]


val = 'a,b, guide'
# Splitting on ',' keeps surrounding whitespace: the last piece is ' guide'.
val.split(',')

['a', 'b', ' guide']


# strip() removes the whitespace around each piece.
pieces = [part.strip() for part in val.split(',')]
pieces

['a', 'b', 'guide']


# Unpack the three pieces into separate names, then glue them with '::'.
first, second, third = pieces
f'{first}::{second}::{third}'

'a::b::guide'


# Same result as the manual concatenation, for any number of pieces.
'::'.join(pieces)

'a::b::guide'


# Substring test.
'guide' in val

True


# Position of the first ','.
val.index(',')

1


# find() returns -1 when the substring is not present.
val.find(':')

-1


# Number of occurrences of ','.
val.count(",")

2


# Replace every ',' with '::'; the space before 'guide' is kept.
val.replace(',', '::')

'a::b:: guide'


# Replacing with an empty string deletes every ','.
val.replace(',', '')

'ab guide'

	0	1	2	3
0	0.738600	-0.507776	0.227991	-1.740421
1	-0.534505	-0.937749	-1.486301	-0.029363
2	-1.237142	0.075986	-1.945198	-0.234840
3	-1.814567	0.059636	-0.397631	0.267998
4	0.890952	0.025341	-0.598142	-1.230504

	0	1	2	3
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	-0.011442	0.013205	0.028648	0.008134
std	1.026796	0.973519	0.998007	1.038934
min	-3.170364	-3.784642	-3.146325	-3.411796
25%	-0.708385	-0.627578	-0.661498	-0.713016
50%	0.012511	0.015492	0.050225	0.004004
75%	0.746445	0.658925	0.703221	0.760504
max	2.938272	3.190904	3.039147	3.375219

	0	1	2	3
19	0.758383	-3.784642	-1.219263	-0.369502
59	0.753059	0.858625	0.241787	-3.411796
197	1.045804	1.145826	3.039147	-0.489863
222	-0.660346	-0.732768	1.989074	3.344321
238	-2.610590	-0.620662	0.026520	-3.256768
388	-3.170364	0.079760	0.077850	0.877214
416	0.059133	0.389149	-3.146325	0.954444
445	0.991565	3.190904	0.057445	0.359985
466	0.784626	-1.549977	0.370359	3.238048
840	-3.130360	1.642567	-0.398491	0.523445
857	-0.084178	0.578835	1.869249	3.375219
915	-1.351265	1.018378	0.208455	-3.198062

	0	1	2	3
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	-0.011141	0.013799	0.028755	0.008043
std	1.025897	0.970161	0.997434	1.033362
min	-3.000000	-3.000000	-3.000000	-3.000000
25%	-0.708385	-0.627578	-0.661498	-0.713016
50%	0.012511	0.015492	0.050225	0.004004
75%	0.746445	0.658925	0.703221	0.760504
max	2.938272	3.000000	3.000000	3.000000

	0	1	2	3
0	1.0	-1.0	1.0	-1.0
1	-1.0	-1.0	-1.0	-1.0
2	-1.0	1.0	-1.0	-1.0
3	-1.0	1.0	-1.0	1.0
4	1.0	1.0	-1.0	-1.0

PROGRAMMING

PROGRAMMING

Python 8 본문

Python 8

`mapping transform`¶

`배열 > 카테고리 조건분류`¶

`One-Hot Encoding` (전처리)¶

`.dat file`¶

`get_dummies` (one-hot encoding)¶

`split, strip`¶

index, find, count, replace¶

'Python > Basic' 카테고리의 다른 글

티스토리툴바

	count	mean	std	min	max
col_1
(-0.019, 4.75]	8	1.125	1.457738	0	4
(4.75, 9.5]	2	7.500	2.121320	6	9
(9.5, 14.25]	4	12.750	1.892969	10	14
(14.25, 19.0]	6	17.000	1.788854	15	19

	movie_id	title	genres
0	1	Toy Story (1995)	Animation\|Children's\|Comedy
1	2	Jumanji (1995)	Adventure\|Children's\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama
4	5	Father of the Bride Part II (1995)	Comedy
5	6	Heat (1995)	Action\|Crime\|Thriller
6	7	Sabrina (1995)	Comedy\|Romance
7	8	Tom and Huck (1995)	Adventure\|Children's
8	9	Sudden Death (1995)	Action
9	10	GoldenEye (1995)	Action\|Adventure\|Thriller

	Animation	Children's	Comedy	Adventure	Fantasy	Romance	Drama	Action	Crime	Thriller	Horror	Sci-Fi	Documentary	War	Musical	Mystery	Film-Noir	Western
0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
3878	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3879	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3880	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3881	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3882	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

	(0.0, 0.2]	(0.2, 0.4]	(0.4, 0.6]	(0.6, 0.8]	(0.8, 1.0]
0	0	0	0	0	1
1	0	1	0	0	0
2	1	0	0	0	0
3	0	1	0	0	0
4	0	0	1	0	0
5	0	0	1	0	0
6	0	0	0	0	1
7	0	0	0	1	0
8	0	0	0	1	0
9	0	0	0	1	0

Python 10 (0)	2021.01.07
Python 9 (0)	2021.01.07
Python 7 (0)	2020.12.28
Python 6 (0)	2020.12.28
Python 5 (0)	2020.12.27

« 2025/06 »
일	월	화	수	목	금	토
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30

	one	two	three	four
Ohio	0	1	2	3
Colorado	4	5	6	7
New York	8	9	10	11

	one	two	three	four
Ohio	0	1	2	3
Colorado	4	5	6	7
New York	8	9	10	11

	one	two	three	four
OHI	0	1	2	3
COL	4	5	6	7
NEW	8	9	10	11

	(0.0, 0.2]	(0.2, 0.4]	(0.4, 0.6]	(0.6, 0.8]	(0.8, 1.0]
0	0	0	0	0	1
1	0	1	0	0	0
2	1	0	0	0	0
3	0	1	0	0	0
4	0	0	1	0	0
5	0	0	1	0	0
6	0	0	0	0	1
7	0	0	0	1	0
8	0	0	0	1	0
9	0	0	0	1	0

PROGRAMMING

Python 8 본문

Python 8

mapping transform¶

배열 > 카테고리 조건분류¶

One-Hot Encoding (전처리)¶

.dat file¶

get_dummies (one-hot encoding)¶

split, strip¶

index, find, count, replace¶

'Python > Basic' 카테고리의 다른 글

티스토리툴바

`mapping transform`¶

`배열 > 카테고리 조건분류`¶

`One-Hot Encoding` (전처리)¶

`.dat file`¶

`get_dummies` (one-hot encoding)¶

`split, strip`¶

	(0.0, 0.2]	(0.2, 0.4]	(0.4, 0.6]	(0.6, 0.8]	(0.8, 1.0]
0	0	0	0	0	1
1	0	1	0	0	0
2	1	0	0	0	0
3	0	1	0	0	0
4	0	0	1	0	0
5	0	0	1	0	0
6	0	0	0	0	1
7	0	0	0	1	0
8	0	0	0	1	0
9	0	0	0	1	0