- 문제에 대한 나의 풀이(틀릴 수 있음)

import pandas as pd 
import numpy as np 
from plotnine import * 
import matplotlib.pyplot as plt 
import seaborn as sns

1

- (a)-(c)

- (b)-(d)

2

# Four sample points reused by every panel.
x = [1, 2, 3, 4]
y = [1, 2, 4, 3]
# 2x2 grid of axes, unpacked row by row so each panel keeps its own name.
fig, axs = plt.subplots(2, 2)
(ax1, ax2), (ax3, ax4) = axs
# One matplotlib format string per panel, in ax1..ax4 order.
panel_formats = ['o:r', 'Xb', 'xm', '.--k']
for panel, fmt in zip((ax1, ax2, ax3, ax4), panel_formats):
    panel.plot(x, y, fmt)
[<matplotlib.lines.Line2D at 0x25605a16100>]
fig

3

- 하니, 홍두깨, 고은애, 이창수

4

- 하니, 홍두깨, 고은애

5

df = pd.read_csv('https://raw.githubusercontent.com/guebin/2021DV/master/_notebooks/2021-10-25-FIFA22_official_data.csv')
df.head()
ID Name Age Photo Nationality Flag Overall Potential Club Club Logo ... SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Best Position Best Overall Rating Release Clause DefensiveAwareness
0 212198 Bruno Fernandes 26 https://cdn.sofifa.com/players/212/198/22_60.png Portugal https://cdn.sofifa.com/flags/pt.png 88 89 Manchester United https://cdn.sofifa.com/teams/11/30.png ... 65.0 12.0 14.0 15.0 8.0 14.0 CAM 88.0 €206.9M 72.0
1 209658 L. Goretzka 26 https://cdn.sofifa.com/players/209/658/22_60.png Germany https://cdn.sofifa.com/flags/de.png 87 88 FC Bayern München https://cdn.sofifa.com/teams/21/30.png ... 77.0 13.0 8.0 15.0 11.0 9.0 CM 87.0 €160.4M 74.0
2 176580 L. Suárez 34 https://cdn.sofifa.com/players/176/580/22_60.png Uruguay https://cdn.sofifa.com/flags/uy.png 88 88 Atlético de Madrid https://cdn.sofifa.com/teams/240/30.png ... 38.0 27.0 25.0 31.0 33.0 37.0 ST 88.0 €91.2M 42.0
3 192985 K. De Bruyne 30 https://cdn.sofifa.com/players/192/985/22_60.png Belgium https://cdn.sofifa.com/flags/be.png 91 91 Manchester City https://cdn.sofifa.com/teams/10/30.png ... 53.0 15.0 13.0 5.0 10.0 13.0 CM 91.0 €232.2M 68.0
4 224334 M. Acuña 29 https://cdn.sofifa.com/players/224/334/22_60.png Argentina https://cdn.sofifa.com/flags/ar.png 84 84 Sevilla FC https://cdn.sofifa.com/teams/481/30.png ... 82.0 8.0 14.0 13.0 13.0 14.0 LB 84.0 €77.7M 80.0

5 rows × 65 columns

(a)

df['Loaned From']
0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
        ... 
16705    NaN
16706    NaN
16707    NaN
16708    NaN
16709    NaN
Name: Loaned From, Length: 16710, dtype: object
df['Loaned From'].isnull().sum()
15578
df['Marking']
0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
16705     5.0
16706     NaN
16707     NaN
16708     NaN
16709    15.0
Name: Marking, Length: 16710, dtype: float64
df['Marking'].isnull().sum()
15818

- 두 열 모두 결측치가 매우 많다 (전체 16710개 중 'Loaned From' 15578개, 'Marking' 15818개)

(b)

df = df.drop(['Loaned From', 'Marking'], axis = 1)
df.isnull().sum().sum()
5404

(c)

df = df.dropna()
df.isnull().sum().sum()
0

- 결측치가 포함된 행이 모두 제거되어 남아 있던 5404개의 결측치가 0개가 되었다 (행 수: 16710 → 14398)

(d)

def convert_currency(value):
    """Convert a euro amount string such as '€206.9M', '€250K' or '€650' to a float.

    Parameters
    ----------
    value : str
        Amount with an optional '€' sign and an optional 'M' (millions) or
        'K' (thousands) unit.

    Returns
    -------
    float
        The amount in euros. Every branch returns a float, so a column built
        from these values gets a numeric dtype instead of ``object``.

    Raises
    ------
    ValueError
        If what remains after removing '€' and the unit is not a number.
    """
    amount = value.replace("€", "")
    if "M" in amount:
        return float(amount.replace("M", "")) * 1000000
    if "K" in amount:
        return float(amount.replace("K", "")) * 1000
    # No unit: the original returned the stripped string here, which made the
    # resulting column a mix of floats and strings.
    return float(amount)
df['Wage'] = list(map(convert_currency, df['Wage']))
df['Wage']
0        250000.0
1        140000.0
2        135000.0
3        350000.0
4         45000.0
           ...   
16703         650
16704         950
16706         550
16707         700
16708         500
Name: Wage, Length: 14398, dtype: object

(e)

df['Value'] = list(map(convert_currency, df['Value']))
# Mean 'Value' per 'Best Position'. The string alias 'mean' is used instead of
# the NumPy callable, which newer pandas versions deprecate inside agg().
df2 = df.groupby(by='Best Position').agg({'Value': 'mean'})

- 신기한 점 : 컴퓨터마다(?) 'Value'가 object 타입인지 float 타입인지가 다른 것 같다

- 나의 경우는 자료형을 변환하지 않아도 위와 같이 잘 되는데 안 되는 경우도 있나 봄 (위 'Wage' 출력처럼 M/K 단위가 없는 값이 문자열로 남으면 열 전체가 object 타입이 되므로, 그런 값이 섞여 있으면 평균 계산이 실패할 수 있음)

df2
Value
Best Position
CAM 4.356162e+06
CB 3.038834e+06
CDM 3.539740e+06
CF 9.122222e+06
CM 5.630414e+06
GK 2.703686e+06
LB 3.051887e+06
LM 3.439977e+06
LW 6.443137e+06
LWB 3.451340e+06
RB 3.203283e+06
RM 2.550153e+06
RW 3.977832e+06
RWB 3.023522e+06
ST 3.295080e+06
df2 = df2.stack().reset_index()
df2 = df2.rename(columns={'level_1':'group1', 0:'mean(Value)'})
df2
Best Position group1 mean(Value)
0 CAM Value 4.356162e+06
1 CB Value 3.038834e+06
2 CDM Value 3.539740e+06
3 CF Value 9.122222e+06
4 CM Value 5.630414e+06
5 GK Value 2.703686e+06
6 LB Value 3.051887e+06
7 LM Value 3.439977e+06
8 LW Value 6.443137e+06
9 LWB Value 3.451340e+06
10 RB Value 3.203283e+06
11 RM Value 2.550153e+06
12 RW Value 3.977832e+06
13 RWB Value 3.023522e+06
14 ST Value 3.295080e+06
df2 = df2.sort_values(by = ['mean(Value)'], axis = 0, ascending = False)
df2 = df2.reset_index(drop = True)
df2
Best Position group1 mean(Value)
0 CF Value 9.122222e+06
1 LW Value 6.443137e+06
2 CM Value 5.630414e+06
3 CAM Value 4.356162e+06
4 RW Value 3.977832e+06
5 CDM Value 3.539740e+06
6 LWB Value 3.451340e+06
7 LM Value 3.439977e+06
8 ST Value 3.295080e+06
9 RB Value 3.203283e+06
10 LB Value 3.051887e+06
11 CB Value 3.038834e+06
12 RWB Value 3.023522e+06
13 GK Value 2.703686e+06
14 RM Value 2.550153e+06
cnt = 0


def z(x):
    """Return 'True' for the first three calls and 'False' for every later call.

    The argument is ignored; only the module-level call counter ``cnt``
    decides the label, so ``cnt`` must be reset to 0 before each new pass.
    """
    global cnt
    cnt = cnt + 1
    return 'True' if cnt <= 3 else 'False'
df2['Highlight'] = list(map(z, df2['mean(Value)']))
df2
Best Position group1 mean(Value) Highlight
0 CF Value 9.122222e+06 True
1 LW Value 6.443137e+06 True
2 CM Value 5.630414e+06 True
3 CAM Value 4.356162e+06 False
4 RW Value 3.977832e+06 False
5 CDM Value 3.539740e+06 False
6 LWB Value 3.451340e+06 False
7 LM Value 3.439977e+06 False
8 ST Value 3.295080e+06 False
9 RB Value 3.203283e+06 False
10 LB Value 3.051887e+06 False
11 CB Value 3.038834e+06 False
12 RWB Value 3.023522e+06 False
13 GK Value 2.703686e+06 False
14 RM Value 2.550153e+06 False
ggplot(df2) + geom_bar(aes(x = 'Best Position', y = 'mean(Value)', fill = 'Highlight'), stat = 'identity')
<ggplot: (160530408406)>

(f)

- 문제는 alpha = 0.5로 하라고 했는데 시각화 예시는 alpha = 0.2임...

ggplot(df) + geom_point(aes(x = 'Dribbling', y = 'SlidingTackle', color = 'Age'), alpha = 0.5, size = 0.5) + facet_wrap('Best Position')
<ggplot: (160533279925)>

(g)

- 하니, 홍두깨, 고은애

(h)

# Keep only the rows whose 'Best Position' is CAM or CB.
df3 = df[df['Best Position'].isin(['CAM', 'CB'])]

- query로는 안 돼서 위와 같이 했음 (아마 열 이름 'Best Position'에 공백이 있기 때문인 듯 — query에서는 열 이름을 백틱으로 감싸 `Best Position` == "CAM" 처럼 써야 함)

- 문제에는 CB가 아니라 CM인데 plot은 CB로 되어있어서 CB로 했음 + (i)문제에서도 CB라고 언급함

ggplot(df3) + geom_point(aes(x = 'Dribbling', y = 'SlidingTackle', color = 'Age', size = 'Value'), alpha = 0.2 ) + facet_wrap('Best Position')
<ggplot: (160531860947)>

(i)

- 홍두깨

6

# Coordinates of course A (x, y) and course B (x2, y2).
x, y = [0, 1, 4, 5], [0, 2, 3, 5]
x2, y2 = [5, 4.1, 1, 0], [5, 3, 0.5, 0]
# Course A as a data frame, tagged with its course label.
df_ = pd.DataFrame({'x': x, 'y': y}).assign(course='A')
df_
x y course
0 0 0 A
1 1 2 A
2 4 3 A
3 5 5 A
df2_ = pd.DataFrame({'x':x2 ,'y':y2})
df2_['course'] = 'B'
df2_
x y course
0 5.0 5.0 B
1 4.1 3.0 B
2 1.0 0.5 B
3 0.0 0.0 B
stamina = 100  ## stamina when leaving home
x_ = 0
y_ = 0


def f(x, y):
    """Move to (x, y), spend stamina equal to the distance walked, return what is left.

    Reads and updates the module-level globals ``x_`` / ``y_`` (the previous
    position) and ``stamina`` (the running total), so the result depends on
    the order of calls.
    """
    global x_, y_, stamina

    dx = x - x_
    dy = y - y_
    stamina = stamina - (dx**2 + dy**2) ** 0.5  ## distance between the two points
    x_, y_ = x, y

    return stamina
df_['stamina'] = list(map(f, df_['x'], df_['y']))
df_
x y course stamina
0 0 0 A 100.000000
1 1 2 A 97.763932
2 4 3 A 94.601654
3 5 5 A 92.365586
## x_ / y_ still hold the last point passed to f, so the first row of course B
## is measured from there.
stamina -= 70 ## getting from the end of course A to the start of course B costs 70 stamina
df2_['stamina'] = list(map(f, df2_['x'], df2_['y']))
df2_
x y course stamina
0 5.0 5.0 B 22.365586
1 4.1 3.0 B 20.172415
2 1.0 0.5 B 16.189954
3 0.0 0.0 B 15.071920
df3_ = pd.concat([df_, df2_]) ## stack the two course data frames row-wise (each keeps its own 0-3 row index)
df3_
x y course stamina
0 0.0 0.0 A 100.000000
1 1.0 2.0 A 97.763932
2 4.0 3.0 A 94.601654
3 5.0 5.0 A 92.365586
0 5.0 5.0 B 22.365586
1 4.1 3.0 B 20.172415
2 1.0 0.5 B 16.189954
3 0.0 0.0 B 15.071920
ggplot(df3_) + geom_point(aes(x = 'x', y = 'y')) + geom_line(aes(x = 'x', y = 'y', size = 'stamina', color = 'course'), alpha = 0.5)
<ggplot: (160530609351)>

7

- 문제 풀고나니 느낀건데 국어를 너무 못함 ---> 문제를 제대로 파악을 못함

# Two people (A, B), each with one row per season: 'count' and 'goal' totals.
p = ['A', 'A', 'B', 'B']
s1 = [10, 20, 30, 4]
s2 = [7, 19, 23, 4]
s3 = ['one', 'two', 'one', 'two']
data = pd.DataFrame(dict(person=p, count=s1, goal=s2, season=s3))
data
person count goal season
0 A 10 7 one
1 A 20 19 two
2 B 30 23 one
3 B 4 4 two
data['prob'] = data['goal'] / data['count']
data
person count goal season prob
0 A 10 7 one 0.700000
1 A 20 19 two 0.950000
2 B 30 23 one 0.766667
3 B 4 4 two 1.000000
# Per-person totals over all seasons. String aliases replace the NumPy
# callables, which newer pandas versions deprecate inside agg().
data2 = (
    data.groupby(['person'])
    .agg({'count': 'sum', 'goal': 'sum'})
    .reset_index()
    .rename(columns={'count': 'sum', 'goal': 'goal_sum'})
)
data2
person sum goal_sum
0 A 30 26
1 B 34 27
td = pd.merge(data, data2)
td
person count goal season prob sum goal_sum
0 A 10 7 one 0.700000 30 26
1 A 20 19 two 0.950000 30 26
2 B 30 23 one 0.766667 34 27
3 B 4 4 two 1.000000 34 27
## td has one row per (person, season) and the bar chart stacks the rows of a
## person, so each row carries an equal share of the overall success rate.
## Dividing by the per-person row count (instead of a hard-coded 2) keeps the
## stacked height equal to goal_sum / sum for any number of seasons.
td['prob2'] = (td['goal_sum'] / td['sum']) / td.groupby('person')['person'].transform('size')
td
person count goal season prob sum goal_sum prob2
0 A 10 7 one 0.700000 30 26 0.433333
1 A 20 19 two 0.950000 30 26 0.433333
2 B 30 23 one 0.766667 34 27 0.397059
3 B 4 4 two 1.000000 34 27 0.397059
ggplot(data)\
+ geom_bar(aes(x = 'person', y = 'prob', fill = 'person'), stat = 'identity')\
+ facet_wrap('season')
<ggplot: (160530787118)>

- 시즌별 성공확률은 B가 더 높다

- 하지만 전체 성공확률을 보면?

ggplot(td)\
+ geom_bar(aes(x = 'person', y = 'prob2', fill = 'person'), stat = 'identity')
<ggplot: (160532310432)>

- A가 더 높은 것을 확인할 수 있다

- B가 100% 확률을 기록했을 때 전체 횟수가 모종의 이유(부상 치료 등)로 인해 4회밖에 되지 않아 전체에 끼치는 영향력이 작아져 위와 같은 결과가 발생했다