728x90

728x170

빅데이터분석기사 실기 기출 복원 문제 - 제1유형

들어가며

빅데이터분석기사 실기 기출 제1유형 복원 문제를 정리해본다.
2회 ~ 6회까지 분량이며, 2회부터 5회까지의 제1유형은 단답형 문제였으나 6회 시험부터 작업형 문제로 바뀌었고, 이에 맞게 문제의 내용이 수정되었다.
문제 해결 방법을 쭉 읽어 보는 용도로 정리해보았다.

문제

2021년 2회

문제 1

BostonHousing 데이터
crim 항목의 상위에서 10번째 값(즉, 상위 10개의 값 중에서 가장 작은 값)으로 상위 10개의 값을 변환하고, age가 80 이상인 값에 대하여 crim의 평균 구하기
소수점 3째 자리에서 반올림해서 소수점 2째 자리로 출력하기

 import numpy as np
import pandas as pd
 
df = pd.read_csv('dataset.csv')
 
# (1) 상위 10개 값 중에서 가장 적은 값으로 상위 10개의 값 변환하기
top10 = df['crim'].sort_values(ascending=False).head(10)   # 내림차순 정렬 후 상위 10개만 뽑아내기
print(top10)
 
380    88.9762
418    73.5341
405    67.9208
410    51.1358
414    45.7461
404    41.5292
398    38.3518
427    37.6619
413    28.6558
417    25.9406
Name: crim, dtype: float64
 
tenth = top10.iloc[9]
df['crim'] = np.where(df['crim'] >= tenth, tenth, df['crim'])   # tenth 변수보다 크거나 같으면 tenth 값으로 변환
 
# (2) age가 80 이상인 값에 대하여 crim의 평균 구하기
over80 = df[df['age'] >= 80]
print(over80['age'].describe())
 
count    240.000000
mean      93.230833
std        5.734098
min       80.300000
25%       88.750000
50%       94.500000
75%       98.200000
max      100.000000
Name: age, dtype: float64
 
print(round(over80['crim'].mean(), 2))   # 평균을 구하고 반올림해서 소수점 2째자리까지 출력
 
5.76

문제 2

housing 데이터
데이터의 첫 번째 행부터 순서대로 80%까지의 데이터를 훈련 데이터로 추출 후, total_bedrooms 변수의 결측값(NA)을 total_bedrooms 변수의 중앙값으로 대체하고, 대체 전의 total_bedrooms 변수 표준편차 값과 대체 후의 total_bedrooms 변수 표준편차 값의 차이의 절댓값 구하기
소수점 3째 자리에서 반올림해서 소수점 2째 자리로 출력하기

 import numpy as np
import pandas as pd
 
housing = pd.read_csv('dataset.csv')
 
# (1) 첫 번째 행부터 순서대로 80%까지의 데이터를 추출
nrow = int(len(housing) * 0.8) 
df = housing.iloc[:nrow, :]
df = pd.DataFrame(df)
 
a = df['total_bedrooms'].std()  # 대체 전의 표준편차
 
# (2) 결측값(NA)를 중앙값으로 대체
median_train = df['total_bedrooms'].median()    # 중앙값
df['total_bedrooms'] = df['total_bedrooms'].fillna(median_train)
 
b = df['total_bedrooms'].std()   # 대체 후의 표준편차
 
# (3) 대체 전과 대체 후의 total_bedrooms 변수 표준편차 값 차이의 절댓값 구하기
print(round(np.abs(a - b), 2))
 
1.98

문제 3

Insurance 데이터 세트
Charges 항목에서 이상값의 합 구하기
이상값은 평균에서 1.5 표준편차 이상인 값으로 하고, 소수점 이하는 버리고 정수로 출력하기

 import pandas as pd
import numpy as np
 
df = pd.read_csv('dataset.csv')
 
# 이상값 구하기
upper = np.mean(df['charges']) + 1.5 * np.std(df['charges'])
lower = np.mean(df['charges']) - 1.5 * np.std(df['charges'])
 
# Boolean mask of the outlier rows (charges at least 1.5 standard deviations
# away from the mean, on either side). It is named is_outlier so that the
# built-in range() is not shadowed. Element-wise OR on Series needs `|`,
# not the `or` keyword.
is_outlier = (df['charges'] >= upper) | (df['charges'] <= lower)

# Sum every column over the outlier rows
result = df[is_outlier].sum()
print(result)
 
age                                                      6192
sex         malemalefemalemalemalemalemalemalemalemalemale...
bmi                                                  5468.835
children                                                  177
smoker      yesyesyesyesyesyesyesyesyesyesyesyesyesyesyesy...
region      southeastsouthwestnortheastsouthwestsouthwests...
charges                                         6421430.02067
dtype: object
 
# 소수점 이하는 버리고 정수로 출력하기
print(int(result['charges']))
 
6421430

2021년 3회

문제 1

housing 데이터
결측값이 있는 모든 행을 제거한 후, 데이터의 순서대로 상위 70%의 데이터를 학습 데이터로 만들고, 훈련 데이터의 housing_median_age 컬럼의 제1사분위수(Q1) 구하기
소수점 이하는 버리고 정수로 출력하기

 import numpy as np
import pandas as pd
 
housing = pd.read_csv("dataset.csv")
print(housing.info())
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
None
 
# (1) 결측값이 있는 모든 행을 제거
## 결측값이 있는 행의 개수 출력
step1_before = sum(housing.isnull().any(axis=1))
print(step1_before)
 
207
 
## 결측값 제거 후, 결측값이 제거되었는지 확인
housing = housing.dropna()   # 결측값이 있는 모든 행 제거
step1_after = sum(housing.isnull().any(axis=1))
print(step1_after)
 
0
 
# (2) 데이터의 순서대로 상위 70%의 데이터를 학습 데이터로 만들기
nrow = int(len(housing) * 0.7)
df = housing.iloc[:nrow , :]
 
# (3) 훈련 데이터의 housing_median_age 컬럼의 Q1 구하기
step2 = int(np.quantile(df['housing_median_age'], q=0.25))
print(step2)
 
19

문제 2

타이타닉 데이터
데이터가 없는 것을 결측값으로 하여 결측값 비율을 구하고 결측값 비율이 가장 높은 컬럼의 이름 구하기

 import pandas as pd
 
titanic = pd.read_csv("dataset.csv")
 
# (1) 데이터가 없는 것을 결측값으로 하여 결측값 비율 구하기
cs = titanic.isna().sum() / len(titanic)    # 컬럼별 결측값의 비율(= 컬럼별 결측값 개수 / 전체 개수) 계산
print(cs)
 
PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Embarked       0.002245
dtype: float64
 
# (2) 결측값 비율이 가장 높은 컬럼 이름 구하기
cs = pd.DataFrame(cs)
ds = cs.sort_values(by=0, ascending=False)   # 내림차순 정렬 (by=0 : 첫 번째 열 기준 정렬)
print(ds)
 
                    0
Age          0.198653
Embarked     0.002245
PassengerId  0.000000
Survived     0.000000
Pclass       0.000000
Name         0.000000
Sex          0.000000
SibSp        0.000000
Parch        0.000000
Ticket       0.000000
Fare         0.000000
 
print(ds.index[0])
 
Age

문제 3

연도별 각 국가의 결핵 감염에 대한 유병률 데이터
country, year, new_sp 컬럼에 결측값이 있을 경우 제거하고, 2000년도 국가별 결핵 발생 건수의 평균을 구한 뒤, 2000년도 결핵 발생 건수가 그 평균보다 높은 국가의 개수 구하기
국가별 결핵 발생 건수에 대한 평균 결핵 발생 건수를 출력할 때 소수점 3째 자리에서 반올림해서 소수점 2째자리로 출력하기

 import pandas as pd
 
who = pd.read_csv('dataset.csv')
 
# (1) 대상 컬럼 선택 후, 결측값이 있을 경우 제거하기
who = who[["year", "country", "new_sp"]]   # new_sp : 결핵 발생 건수
who = who.dropna()     # 결측값 제거
print(who)
 
      year      country   new_sp
17    1997  Afghanistan    618.0
18    1998  Afghanistan   1833.0
19    1999  Afghanistan   1669.0
20    2000  Afghanistan   2892.0
21    2001  Afghanistan   4639.0
...    ...          ...      ...
8694  2008     Zimbabwe   9830.0
8695  2009     Zimbabwe  10195.0
8696  2010     Zimbabwe  11654.0
8697  2011     Zimbabwe  12596.0
8698  2012     Zimbabwe  12163.0
 
[3902 rows x 3 columns]
 
# (2) 2000년도에 국가별 결핵 발생 건수에 대한 평균 결핵 발생 건수 구하기
# Keep only the rows for the year 2000
crit1 = who["year"] == 2000
who2000 = who[crit1]

# Use the Series method here: this snippet imports only pandas, so the
# previous np.mean(...) call raised NameError (np was never imported).
mean_cnt = who2000["new_sp"].mean()
print(round(mean_cnt, 2))    # print rounded to two decimal places
 
7865.34
 
# (3) 2000년도의 결핵 발생 건수가 2000년도 국가별 결핵 발생 건수에 대한 평균 결핵 발생 건수보다 결핵 발생 건수가 높은 국가의 개수 구하기
# The task asks for countries whose count is strictly higher than the mean,
# so compare with `>` rather than `>=`.
crit2 = who2000["new_sp"] > mean_cnt
country = who2000[crit2]
print(len(country))
 
38

2022년 4회

문제 1

순서대로 처리하고 결과를 정수로 출력하기
- (1) y 변수의 1사분위와 3사분위 값 구하기
- (2) 3사분위수에서 1사분위수를 뺀 값 구하기
- (3) 소수점 이하는 버리고 정수로 출력하기

 import pandas as pd
 
df = pd.read_csv("dataset.csv")
print(df.describe())
 
              x          y
count  10.00000  10.000000
mean    5.50000  58.500000
std     3.02765  24.712795
min     1.00000  15.000000
25%     3.25000  40.250000
50%     5.50000  60.000000
75%     7.75000  77.000000
max    10.00000  91.000000
 
# (1) y 변수의 1사분위와 3사분위 값 구하기
q1 = df['y'].quantile(0.25)   # 1사분위 값
q3 = df['y'].quantile(0.75)   # 3사분위 값
 
# (2) 3사분위수에서 1사분위수를 뺀 값 구하기
print(int(abs(q3 - q1)))   # (3) 소수점 이하는 버리고 정수로 출력하기
 
36

문제 2

페이스북 평가 데이터
'좋아요' 수(num_loves)와 '놀랐어요' 수(num_wows)를 긍정의 평가로 보고 전체 반응(num_reactions)에서 긍정인 비율이 0.4보다 크고 0.5보다 작은 비디오 개수 구하기

 import pandas as pd
 
f = pd.read_csv('dataset.csv')
 
# (1) 긍정 평가 비율 구하기 (긍정 = 좋아요 + 놀랐어요)
f['pos'] = (f['num_loves'] + f['num_wows']) / f['num_reactions']
 
# (2) 긍정인 비율이 0.4 보다 크고 0.5 보다 작은 비디오 개수 구하기
# Boolean mask for a positive ratio strictly between 0.4 and 0.5. It is named
# in_band so that the built-in range() is not shadowed. Element-wise AND on
# Series needs `&`, not the `and` keyword.
in_band = (f['pos'] > 0.4) & (f['pos'] < 0.5)
result = f[in_band]

print(len(result))
 
90

문제 3

넷플릭스에서 상영된 작품들의 목록 데이터
2018년 1월에 넷플릭스에서 추가한 작품 중 United Kingdom에서 단독으로 제작된 작품의 개수 구하기

 import pandas as pd
 
netflix = pd.read_csv('dataset.csv')
 
# 추가된 날짜 확인하기
print(netflix['date_added'].head())
 
0    September 25, 2021
1    September 24, 2021
2    September 24, 2021
3    September 24, 2021
4    September 24, 2021
Name: date_added, dtype: object
 
# 2018년 1월 영국에서 단독으로 추가된 작품 고르기
netflix['date_added'] = pd.to_datetime(netflix['date_added'], format="%B %d, %Y")    # January 1, 2023
 
print(netflix['date_added'])
 
0      2021-09-25
1      2021-09-24
2      2021-09-24
3      2021-09-24
4      2021-09-24
          ...    
8802   2019-11-20
8803   2019-07-01
8804   2019-11-01
8805   2020-01-11
8806   2019-03-02
Name: date_added, Length: 8807, dtype: datetime64[ns]
 
crit1 = netflix['country'] == "United Kingdom"    # United Kingdom
crit2 = netflix['date_added'].dt.year == 2018    # 2018년
crit3 = netflix['date_added'].dt.month == 1     # 1월
 
result = netflix[crit1 & crit2 & crit3]   # 기준에 맞는 행만 저장
print(len(result))
 
6

2022년 5회

문제 1

종량제 쓰레기 데이터
다음 기준에 따른 데이터를 추출하고 평균 가격을 제출 형식에 따라 제출하기
- 종량제봉투종류 : 규격봉투
- 종량제봉투용도 : 음식물쓰레기
- 종량제봉투용량 : 2L
- (가격이 0인 것은 구매하지 않은 것으로 평균 계산할 때 제외한다.)
소수점 첫 번째 자리에서 반올림해서 정수형으로 출력하기

 import pandas as pd
 
waste = pd.read_csv('dataset.csv', encoding='euc-kr')
print(waste.info())
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 746 entries, 0 to 745
Data columns (total 26 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   시도명        746 non-null    object
 1   시군구명       746 non-null    object
 2   종량제봉투종류    746 non-null    object
 3   종량제봉투처리방식  746 non-null    object
 4   종량제봉투용도    746 non-null    object
 5   종량제봉투사용대상  746 non-null    object
 6   1L가격       746 non-null    int64 
 7   1.5L가격     746 non-null    int64 
 8   2L가격       746 non-null    int64 
 9   2.5L가격     746 non-null    int64 
 10  3L가격       746 non-null    int64 
 11  5L가격       746 non-null    int64 
 12  10L가격      746 non-null    int64 
 13  20L가격      746 non-null    int64 
 14  30L가격      746 non-null    int64 
 15  50L가격      746 non-null    int64 
 16  60L가격      746 non-null    int64 
 17  75L가격      746 non-null    int64 
 18  100L가격     746 non-null    int64 
 19  120L가격     746 non-null    int64 
...
memory usage: 151.7+ KB
None
 
# 기준에 따른 데이터 추출하기
crit1 = waste['종량제봉투종류'] == '규격봉투'
crit2 = waste['종량제봉투용도'] == '음식물쓰레기'
crit3 = waste['2L가격'] != 0   # 가격이 0인 것(구매하지 않은 것)을 제외하기
 
df = waste[crit1 & crit2 & crit3]
 
# 평균 가격 구하기
mean_price = df['2L가격'].mean()
# The task requires rounding at the first decimal place; int() alone would
# truncate (e.g. 120.7 -> 120), so round first and then convert to int.
print(int(round(mean_price)))
 
120

문제 2

Body 데이터
다음 기준에 의해 BMI를 계산하여 분류하고, 정상 체중 범위의 구간에 있는 인원과 위험 체중 범위의 구간에 있는 인원의 차이를 절댓값으로 구하기
- BMI = Weight / Height² (Weight 단위 : kg, Height 단위 : m)
- 저체중 : BMI < 18.5
- 정상체중 : 18.5 ≤ BMI < 23
- 위험체중 : 23 ≤ BMI < 25
- 비만 : 25 ≤ BMI
BMI 계산시 단위에 유의하고, 소수점 첫 번째 자리에서 반올림해서 정수로 출력하기

 import pandas as pd
 
body = pd.read_csv('dataset.csv')
 
# BMI를 계산한 후 추가하기
body['bmi'] = body['Weight'] / ((body['Height'] / 100) ** 2)
 
# 체중 범위 구하기
## 정상 체중
crit1 = (body['bmi'] >= 18.5) & (body['bmi'] < 23)
normal = body[crit1]
cnt_normal = len(normal)
print(cnt_normal)
 
47
 
## 위험 체중
crit2 = (body['bmi'] >= 23) & (body['bmi'] < 25)
danger = body[crit2]
cnt_danger = len(danger)
print(cnt_danger)
 
19
 
# 인원의 차이를 절댓값으로 구하기
result = int(abs(cnt_normal - cnt_danger))
print(result)
 
28

문제 3

임의의 데이터
순전입학생수가 가장 큰 학교의 전체 학생수를 출력하기
- 순전입학생수 = 총전입학생수 - 총전출학생수

 import pandas as pd
 
df = pd.read_csv('dataset.csv', encoding='euc-kr')
print(df.info())
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6287 entries, 0 to 6286
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   시도교육청       6287 non-null   object 
 1   지역교육청       6287 non-null   object 
 2   지역          6280 non-null   object 
 3   정보공시 학교코드   6287 non-null   object 
 4   학교명         6287 non-null   object 
 5   학교급코드       6287 non-null   int64  
 6   설립구분        6287 non-null   object 
 7   제외여부        6287 non-null   object 
 8   제외사유        0 non-null      float64
 9   1학년 전입학생수   6287 non-null   int64  
 10  1학년 전출학생수   6287 non-null   int64  
 11  1학년 전체학생수   6287 non-null   int64  
 12  2학년 전입학생수   6287 non-null   int64  
 13  2학년 전출학생수   6287 non-null   int64  
 14  2학년 전체학생수   6287 non-null   int64  
 15  3학년 전입학생수   6287 non-null   int64  
 16  3학년 전출학생수   6287 non-null   int64  
 17  3학년 전체학생수   6287 non-null   int64  
 18  4학년 전입학생수   6287 non-null   int64  
 19  4학년 전출학생수   6287 non-null   int64  
...
memory usage: 1.4+ MB
None
 
# <순전입학생수> 구하기
df['순전입학생수'] = df['전입학생수(계)'] - df['전출학생수(계)']
df = df.sort_values(by="순전입학생수", ascending=False)   # <순전입학생수> 열 기준으로 내림차순 정렬하기
 
# 최댓값 뽑아내기 (순전입학생수가 가장 큰 학교)
result = df['전체학생수(계)'].iloc[0]   # 첫 번째 행 값 가져오기
print(result)
 
956

2023년 6회

문제 1

출동소방서별로 주민으로부터 연락 받은 신고일시와 출동한 출동일시를 기록한 데이터
출동소방서별 신고일시로부터 출동일시까지의 연도별 월평균을 구하고, 가장 늦게 출동한 출동소방서의 월평균 시간을 분단위로 제출 형식에 맞게 제출하기
시간은 30초 단위로 반올림하기

 신고일시 YYYY-MM-DD hh:mm
출동일시 YYYY-MM-DD hh:mm
출동소방서

 import pandas as pd
from datetime import datetime
 
df = pd.read_csv('dataset.csv')
 
# 각 컬럼에 저장
df['call_time'] = pd.to_datetime(df['신고일시'])   
df['arrive_time'] = pd.to_datetime(df['출동일시'])
df['fire_station'] = df['출동소방서']
 
# 시간 차이 구하기
## .dt 속성은 datetime 형식의 열에 대해 날짜와 시간 구성 요소에 접근할 수 있게 해준다.
df['diff_time'] = (df['arrive_time'] - df['call_time']).dt.total_seconds()   # 차이를 초로 계산
 
# 그룹화 한 후 내림차순 정렬하기
# Average the delay per (fire station, call year, call month).
# The column is selected explicitly: the first positional argument of
# GroupBy.mean() is `numeric_only`, not a column name, so mean('diff_time')
# did not restrict the aggregation to diff_time.
group_keys = [df['fire_station'], df['call_time'].dt.year, df['call_time'].dt.month]
df = df.groupby(group_keys)[['diff_time']].mean()
df = df.sort_values('diff_time', ascending=False)   # largest mean delay first
 
res_date = df['diff_time'].head(1)    # 첫 번째 행만 선택 (diff_time이 제일 큰 항목)
 
print(res_date)
 
fire_station  call_time  call_time
대화119안전센터     2018       9            3840.0
초월119안전센터     2018       7            2925.0
                         6            2865.0
경기도소방재난본부     2018       8            2220.0
                         10           1620.0
                                       ...  
용문119안전센터     2018       2              45.0
설악119안전센터     2018       6              45.0
백암119안전센터     2018       12             40.0
영북119안전센터     2018       3               0.0
가평119안전센터     2018       6               0.0
Name: diff_time, Length: 623, dtype: float64
 
print(res_date.iloc[0])
 
3840.0
 
res_num = float(res_date.iloc[0]) / 60    # 초 단위로 저장된 res_date를 실수형으로 변환 후, 분단위인 60으로 나눔.
result = round(res_num)    # 반올림
 
print(result)
 
64

문제 2

초등학교 학년별 학생 수와 교사 수를 기록한 데이터
교사 1인당 학생 수가 가장 많은 학교를 선정하고, 선정된 학교의 교사 수를 제출 형식에 맞게 제출하기
- 제출 형식 : 정수
학교명 중복은 없고 단일 학교의 학생 수, 교사 수 데이터만 있는 것으로 함.

 import pandas as pd
 
df = pd.read_csv('dataset.csv')
 
# df 변수의 구조 파악
print(df.info())
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6309 entries, 0 to 6308
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   school_name  6309 non-null   object
 1   student_1    6309 non-null   int64 
 2   student_2    6309 non-null   int64 
 3   student_3    6309 non-null   int64 
 4   student_4    6309 non-null   int64 
 5   student_5    6309 non-null   int64 
 6   student_6    6309 non-null   int64 
 7   teacher      6309 non-null   int64 
dtypes: int64(7), object(1)
memory usage: 394.4+ KB
None
 
# df 변수의 요약 통계량 확인
print(df.describe())
 
         student_1    student_2    student_3    student_4    student_5  \
count  6309.000000  6309.000000  6309.000000  6309.000000  6309.000000   
mean     67.578697    66.378982    66.438897    73.728642    71.712474   
std      67.752782    65.398715    64.544496    70.324875    67.134313   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      10.000000    10.000000    10.000000    11.000000    11.000000   
50%      50.000000    49.000000    51.000000    57.000000    56.000000   
75%     104.000000   101.000000   103.000000   115.000000   112.000000   
max     470.000000   390.000000   401.000000   420.000000   393.000000   
 
         student_6      teacher  
count  6309.000000  6309.000000  
mean     71.538120    23.925503  
std      66.448847    17.191077  
min       0.000000     0.000000  
25%      11.000000     8.000000  
50%      58.000000    20.000000  
75%     112.000000    35.000000  
max     428.000000    95.000000  
 
# 상위 5개 데이터 확인
print(df.head())
 
       school_name  student_1  student_2  student_3  student_4  student_5  \
0    서울교육대학교부설초등학교         95         95         95        104        108   
1  서울대학교사범대학부설초등학교         97        100        102        102        102   
2         서울개일초등학교        183        163        176        186        177   
3         서울구룡초등학교        109        115        115        131        133   
4         서울논현초등학교         34         47         34         46         46   
 
   student_6  teacher  
0        110       31  
1        104       36  
2        193       42  
3        132       38  
4         46       18  
 
# 교사 1인당 학생 수(학생 수 합계 / 교사 수) 구한 후 내림차순 정렬하기
df["tch_std"] = (df["student_1"] + df["student_2"] + df["student_3"] + df["student_4"] + df["student_5"] + df["student_6"]) / df["teacher"]
 
df_sort = df.sort_values("tch_std", ascending=False)
df_sort["tch_std"].head()   # 내림차순 정렬된 것 확인
 
1044    36.500000
3387    34.125000
1464    32.333333
360     32.041667
687     31.652174
Name: tch_std, dtype: float64
 
# 제일 상단 데이터를 추출하고 변수에 저장
result = df_sort["teacher"].head(1)
print(result)
 
1044    6
Name: teacher, dtype: int64
 
# Series에 남아 있는 값 1개를 스칼라로 추출하여 변수에 저장
result = result.iloc[-1]
print(result)
 
6

문제 3

월별 범죄를 기록한 데이터
연도별 월평균 범죄 건수를 구하고, 가장 범죄가 많이 발생한 연도의 연평균 범죄 건수 구하기
파이썬의 경우, CSV 파일을 읽을 때 index_col = 0 옵션을 적용하기

 import pandas as pd
 
df = pd.read_csv('dataset.csv')
 
# df 변수의 구조 파악
print(df.info())
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   년월      84 non-null     object
 1   강력범     84 non-null     int64 
 2   절도범     84 non-null     int64 
 3   폭력범     84 non-null     int64 
 4   지능범     84 non-null     int64 
 5   풍속범     84 non-null     int64 
 6   기타형사범   84 non-null     int64 
dtypes: int64(6), object(1)
memory usage: 4.7+ KB
None
 
# df 변수의 요약 통계량 확인
print(df.describe())
 
              강력범          절도범          폭력범          지능범          풍속범  \
count   84.000000    84.000000    84.000000    84.000000    84.000000   
mean   446.285714  3840.976190  5931.738095  5659.059524   236.666667   
std    143.197780  1359.567144   986.131802   906.075429   161.785767   
min     22.000000    91.000000   263.000000   256.000000     8.000000   
25%    340.500000  2653.500000  5504.750000  5265.000000   149.750000   
50%    420.000000  3924.500000  6034.000000  5831.500000   190.500000   
75%    559.500000  4765.250000  6501.250000  6174.500000   265.500000   
max    724.000000  7317.000000  7849.000000  7115.000000  1216.000000   
 
             기타형사범  
count    84.000000  
mean   1405.666667  
std     287.946796  
min      44.000000  
25%    1257.000000  
50%    1408.000000  
75%    1577.250000  
max    2099.000000  
 
# 상위 5개 데이터 확인
print(df.head())
 
        년월  강력범   절도범   폭력범   지능범  풍속범  기타형사범
0  2007-01  291  2608  6494  5283  242   1126
1  2007-02  252  2403  4613  4270  158    880
2  2007-03  292  2273  6242  5110  196   1119
3  2007-04  306  2794  6813  5492  250   1311
4  2007-05  298  3469  7221  5432  192   1412
 
df.columns = ["ym", "a", "b", "c", "d", "e", "f"]   # 컬럼명을 사용하기 편하게 변경
df["crim_sum"] = df["a"] + df["b"] + df["c"] + df["d"] + df["e"] + df["f"]   # 범죄수 합계 구하기
df["year"] = df["ym"].str[:4]   # 연도 부분만 추출 
 
# Sum the numeric columns per year, then sort by crim_sum in descending order.
# numeric_only=True is spelled out: the first positional argument of
# GroupBy.sum() is `numeric_only`, not a column name, so sum("crim_sum")
# did not select crim_sum. The string column "ym" is dropped either way.
yearly_sum = df.groupby(by="year", as_index=False).sum(numeric_only=True)
df1 = yearly_sum.sort_values("crim_sum", ascending=False)
print(df1.head())
 
   year     a      b      c      d     e      f  crim_sum
6  2013  6276  61585  65422  76541  1562  20565    231951
5  2012  6023  61329  70623  72238  1942  19345    231500
4  2011  6905  54294  72044  71252  2377  16484    223356
2  2009  4495  37175  73069  72262  6203  16864    210068
3  2010  6628  49382  68798  66368  2457  15059    208692
 
# df1 is sorted by crim_sum in descending order, so its first row holds the
# year with the largest total (2013 in the output shown above). Reading it
# from df1 avoids hard-coding the year.
top_year = df1["year"].iloc[0]
crit = df["year"] == top_year
df2 = df[crit]
print(df2.head())
 
         ym    a     b     c     d    e     f  crim_sum  year
72  2013-01  396  4858  4990  6336  126  1631     18337  2013
73  2013-02  346  4382  4192  4900  115  1259     15194  2013
74  2013-03  444  4266  4936  5829  129  1495     17099  2013
75  2013-04  546  4915  5363  6499  125  1748     19196  2013
76  2013-05  558  5203  5834  7029  133  1662     20419  2013
 
res_mean = df2["crim_sum"].mean()
result = round(res_mean)    # 소수점 첫째 자리에서 반올림하여 정수로 저장
 
print(result)
 
19329

728x90

그리드형(광고전용)

저작자표시 비영리 변경금지

'Certificate > BDAE' 카테고리의 다른 글

[빅데이터분석기사 실기] 기출 복원 문제 - 제3유형 (1)	2024.06.05
[빅데이터분석기사 실기] 기출 복원 문제 - 제2유형 (0)	2024.06.05
빅데이터분석기사 시험 개요 (0)	2022.07.11

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

[빅데이터분석기사 실기] 기출 복원 문제 - 제1유형

빅데이터분석기사 실기 기출 복원 문제 - 제1유형

들어가며

문제

2021년 2회

문제 1

문제 2

문제 3

2021년 3회

문제 1

문제 2

문제 3

2022년 4회

문제 1

문제 2

문제 3

2022년 5회

문제 1

문제 2

문제 3

2023년 6회

문제 1

문제 2

문제 3

'Certificate > BDAE' 카테고리의 다른 글

✏️ 카테고리

🏷️ 태그 목록

🗒️ 최근에 올라온 글

⭐ 인기 카테고리

📅 달력

📦 글 보관함

티스토리툴바

개인정보

단축키

내 블로그

블로그 게시글

모든 영역

	import numpy as np
	import pandas as pd

	df = pd.read_csv('dataset.csv')

	# (1) 상위 10개 값 중에서 가장 적은 값으로 상위 10개의 값 변환하기
	top10 = df['crim'].sort_values(ascending=False).head(10) # 내림차순 정렬 후 상위 10개만 뽑아내기
	print(top10)

	380 88.9762
	418 73.5341
	405 67.9208
	410 51.1358
	414 45.7461
	404 41.5292
	398 38.3518
	427 37.6619
	413 28.6558
	417 25.9406
	Name: crim, dtype: float64

	tenth = top10.iloc[9]
	df['crim'] = np.where(df['crim'] >= tenth, tenth, df['crim']) # tenth 변수보다 크거나 같으면 tenth 값으로 변환

	# (2) age가 80 이상인 값에 대하여 crim의 평균 구하기
	over80 = df[df['age'] >= 80]
	print(over80['age'].describe())

	count 240.000000
	mean 93.230833
	std 5.734098
	min 80.300000
	25% 88.750000
	50% 94.500000
	75% 98.200000
	max 100.000000
	Name: age, dtype: float64

	print(round(over80['crim'].mean(), 2)) # 평균을 구하고 반올림해서 소수점 2째자리까지 출력

	5.76

	import numpy as np
	import pandas as pd

	housing = pd.read_csv('dataset.csv')

	# (1) 첫 번째 행부터 순서대로 80%까지의 데이터를 추출
	nrow = int(len(housing) * 0.8)
	df = housing.iloc[:nrow, :]
	df = pd.DataFrame(df)

	a = df['total_bedrooms'].std() # 대체 전의 표준편차

	# (2) 결측값(NA)를 중앙값으로 대체
	median_train = df['total_bedrooms'].median() # 중앙값
	df['total_bedrooms'] = df['total_bedrooms'].fillna(median_train)

	b = df['total_bedrooms'].std() # 대체 후의 표준편차

	# (3) 대체 전과 대체 후의 total_bedrooms 변수 표준편차 값 차이의 절댓값 구하기
	print(round(np.abs(a - b), 2))

	1.98

	import numpy as np
	import pandas as pd

	housing = pd.read_csv("dataset.csv")
	print(housing.info())

	<class 'pandas.core.frame.DataFrame'>
	RangeIndex: 20640 entries, 0 to 20639
	Data columns (total 10 columns):
	# Column Non-Null Count Dtype
	--- ------ -------------- -----
	0 longitude 20640 non-null float64
	1 latitude 20640 non-null float64
	2 housing_median_age 20640 non-null float64
	3 total_rooms 20640 non-null float64
	4 total_bedrooms 20433 non-null float64
	5 population 20640 non-null float64
	6 households 20640 non-null float64
	7 median_income 20640 non-null float64
	8 median_house_value 20640 non-null float64
	9 ocean_proximity 20640 non-null object
	dtypes: float64(9), object(1)
	memory usage: 1.6+ MB
	None

	# (1) 결측값이 있는 모든 행을 제거
	## 결측값이 있는 행의 개수 출력
	step1_before = sum(housing.isnull().any(axis=1))
	print(step1_before)

	207

	## 결측값 제거 후, 결측값이 제거되었는지 확인
	housing = housing.dropna() # 결측값이 있는 모든 행 제거
	step1_after = sum(housing.isnull().any(axis=1))
	print(step1_after)

	0

	# (2) 데이터의 순서대로 상위 70%의 데이터를 학습 데이터로 만들기
	nrow = int(len(housing) * 0.7)
	df = housing.iloc[:nrow , :]

	# (3) 훈련 데이터의 housing_median_age 컬럼의 Q1 구하기
	step2 = int(np.quantile(df['housing_median_age'], q=0.25))
	print(step2)

	19

	import pandas as pd

	titanic = pd.read_csv("dataset.csv")

	# (1) 데이터가 없는 것을 결측값으로 하여 결측값 비율 구하기
	cs = titanic.isna().sum() / len(titanic) # 컬럼별 결측값의 비율(= 컬럼별 결측값 개수 / 전체 개수) 계산
	print(cs)

	PassengerId 0.000000
	Survived 0.000000
	Pclass 0.000000
	Name 0.000000
	Sex 0.000000
	Age 0.198653
	SibSp 0.000000
	Parch 0.000000
	Ticket 0.000000
	Fare 0.000000
	Embarked 0.002245
	dtype: float64

	# (2) 결측값 비율이 가장 높은 컬럼 이름 구하기
	cs = pd.DataFrame(cs)
	ds = cs.sort_values(by=0, ascending=False) # 내림차순 정렬 (by=0 : 첫 번째 열 기준 정렬)
	print(ds)

	0
	Age 0.198653
	Embarked 0.002245
	PassengerId 0.000000
	Survived 0.000000
	Pclass 0.000000
	Name 0.000000
	Sex 0.000000
	SibSp 0.000000
	Parch 0.000000
	Ticket 0.000000
	Fare 0.000000

	print(ds.index[0])

	Age

	import pandas as pd

	who = pd.read_csv('dataset.csv')

	# (1) 대상 컬럼 선택 후, 결측값이 있을 경우 제거하기
	who = who[["year", "country", "new_sp"]] # new_sp : 결핵 발생 건수
	who = who.dropna() # 결측값 제거
	print(who)

	year country new_sp
	17 1997 Afghanistan 618.0
	18 1998 Afghanistan 1833.0
	19 1999 Afghanistan 1669.0
	20 2000 Afghanistan 2892.0
	21 2001 Afghanistan 4639.0
	... ... ... ...
	8694 2008 Zimbabwe 9830.0
	8695 2009 Zimbabwe 10195.0
	8696 2010 Zimbabwe 11654.0
	8697 2011 Zimbabwe 12596.0
	8698 2012 Zimbabwe 12163.0

	[3902 rows x 3 columns]

	# (2) 2000년도에 국가별 결핵 발생 건수에 대한 평균 결핵 발생 건수 구하기
	crit1 = who["year"] == 2000
	who2000 = who[crit1]

	# Use the Series method: this snippet imports only pandas, so np.mean() raised NameError.
	mean_cnt = who2000["new_sp"].mean()
	print(round(mean_cnt, 2)) # print rounded to two decimal places

	7865.34

	# (3) 2000년도의 결핵 발생 건수가 2000년도 국가별 결핵 발생 건수에 대한 평균 결핵 발생 건수보다 결핵 발생 건수가 높은 국가의 개수 구하기
	# The task asks for counts strictly higher than the mean, so use `>` rather than `>=`.
	crit2 = who2000["new_sp"] > mean_cnt
	country = who2000[crit2]
	print(len(country))

	38

	import pandas as pd

	df = pd.read_csv("dataset.csv")
	print(df.describe())

	x y
	count 10.00000 10.000000
	mean 5.50000 58.500000
	std 3.02765 24.712795
	min 1.00000 15.000000
	25% 3.25000 40.250000
	50% 5.50000 60.000000
	75% 7.75000 77.000000
	max 10.00000 91.000000

	# (1) y 변수의 1사분위와 3사분위 값 구하기
	q1 = df['y'].quantile(0.25) # 1사분위 값
	q3 = df['y'].quantile(0.75) # 3사분위 값

	# (2) 3사분위수에서 1사분위수를 뺀 값 구하기
	print(int(abs(q3 - q1))) # (3) 소수점 이하는 버리고 정수로 출력하기

	36

	import pandas as pd

	f = pd.read_csv('dataset.csv')

	# (1) 긍정 평가 비율 구하기 (긍정 = 좋아요 + 놀랐어요)
	f['pos'] = (f['num_loves'] + f['num_wows']) / f['num_reactions']

	# (2) 긍정인 비율이 0.4 보다 크고 0.5 보다 작은 비디오 개수 구하기
	# Mask named in_band so the built-in range() is not shadowed; Series need `&`, not `and`.
	in_band = (f['pos'] > 0.4) & (f['pos'] < 0.5)
	result = f[in_band]

	print(len(result))

	90

	import pandas as pd

	netflix = pd.read_csv('dataset.csv')

	# 추가된 날짜 확인하기
	print(netflix['date_added'].head())

	0 September 25, 2021
	1 September 24, 2021
	2 September 24, 2021
	3 September 24, 2021
	4 September 24, 2021
	Name: date_added, dtype: object

	# 2018년 1월 영국에서 단독으로 추가된 작품 고르기
	netflix['date_added'] = pd.to_datetime(netflix['date_added'], format="%B %d, %Y") # January 1, 2023

	print(netflix['date_added'])

	0 2021-09-25
	1 2021-09-24
	2 2021-09-24
	3 2021-09-24
	4 2021-09-24
	...
	8802 2019-11-20
	8803 2019-07-01
	8804 2019-11-01
	8805 2020-01-11
	8806 2019-03-02
	Name: date_added, Length: 8807, dtype: datetime64[ns]

	crit1 = netflix['country'] == "United Kingdom" # United Kingdom
	crit2 = netflix['date_added'].dt.year == 2018 # 2018년
	crit3 = netflix['date_added'].dt.month == 1 # 1월

	result = netflix[crit1 & crit2 & crit3] # 기준에 맞는 행만 저장
	print(len(result))

	6

	import pandas as pd

	waste = pd.read_csv('dataset.csv', encoding='euc-kr')
	print(waste.info())

	<class 'pandas.core.frame.DataFrame'>
	RangeIndex: 746 entries, 0 to 745
	Data columns (total 26 columns):
	# Column Non-Null Count Dtype
	--- ------ -------------- -----
	0 시도명 746 non-null object
	1 시군구명 746 non-null object
	2 종량제봉투종류 746 non-null object
	3 종량제봉투처리방식 746 non-null object
	4 종량제봉투용도 746 non-null object
	5 종량제봉투사용대상 746 non-null object
	6 1L가격 746 non-null int64
	7 1.5L가격 746 non-null int64
	8 2L가격 746 non-null int64
	9 2.5L가격 746 non-null int64
	10 3L가격 746 non-null int64
	11 5L가격 746 non-null int64
	12 10L가격 746 non-null int64
	13 20L가격 746 non-null int64
	14 30L가격 746 non-null int64
	15 50L가격 746 non-null int64
	16 60L가격 746 non-null int64
	17 75L가격 746 non-null int64
	18 100L가격 746 non-null int64
	19 120L가격 746 non-null int64
	...
	memory usage: 151.7+ KB
	None

	# 기준에 따른 데이터 추출하기
	crit1 = waste['종량제봉투종류'] == '규격봉투'
	crit2 = waste['종량제봉투용도'] == '음식물쓰레기'
	crit3 = waste['2L가격'] != 0 # 가격이 0인 것(구매하지 않은 것)을 제외하기

	df = waste[crit1 & crit2 & crit3]

	# 평균 가격 구하기
	mean_price = df['2L가격'].mean()
	# The task requires rounding at the first decimal place; int() alone would truncate.
	print(int(round(mean_price)))

	120

	import pandas as pd

	body = pd.read_csv('dataset.csv')

	# BMI를 계산한 후 추가하기
	body['bmi'] = body['Weight'] / ((body['Height'] / 100) ** 2)

	# 체중 범위 구하기
	## 정상 체중
	crit1 = (body['bmi'] >= 18.5) & (body['bmi'] < 23)
	normal = body[crit1]
	cnt_normal = len(normal)
	print(cnt_normal)

	47

	## 위험 체중
	crit2 = (body['bmi'] >= 23) & (body['bmi'] < 25)
	danger = body[crit2]
	cnt_danger = len(danger)
	print(cnt_danger)

	19

	# 인원의 차이를 절댓값으로 구하기
	result = int(abs(cnt_normal - cnt_danger))
	print(result)

	28

	import pandas as pd

	df = pd.read_csv('dataset.csv', encoding='euc-kr')
	print(df.info())

	<class 'pandas.core.frame.DataFrame'>
	RangeIndex: 6287 entries, 0 to 6286
	Data columns (total 30 columns):
	# Column Non-Null Count Dtype
	--- ------ -------------- -----
	0 시도교육청 6287 non-null object
	1 지역교육청 6287 non-null object
	2 지역 6280 non-null object
	3 정보공시 학교코드 6287 non-null object
	4 학교명 6287 non-null object
	5 학교급코드 6287 non-null int64
	6 설립구분 6287 non-null object
	7 제외여부 6287 non-null object
	8 제외사유 0 non-null float64
	9 1학년 전입학생수 6287 non-null int64
	10 1학년 전출학생수 6287 non-null int64
	11 1학년 전체학생수 6287 non-null int64
	12 2학년 전입학생수 6287 non-null int64
	13 2학년 전출학생수 6287 non-null int64
	14 2학년 전체학생수 6287 non-null int64
	15 3학년 전입학생수 6287 non-null int64
	16 3학년 전출학생수 6287 non-null int64
	17 3학년 전체학생수 6287 non-null int64
	18 4학년 전입학생수 6287 non-null int64
	19 4학년 전출학생수 6287 non-null int64
	...
	memory usage: 1.4+ MB
	None

	# <순전입학생수> 구하기
	df['순전입학생수'] = df['전입학생수(계)'] - df['전출학생수(계)']
	df = df.sort_values(by="순전입학생수", ascending=False) # <순전입학생수> 열 기준으로 내림차순 정렬하기

	# 최댓값 뽑아내기 (순전입학생수가 가장 큰 학교)
	result = df['전체학생수(계)'].iloc[0] # 첫 번째 행 값 가져오기
	print(result)

	956

	신고일시 YYYY-MM-DD hh:mm
	출동일시 YYYY-MM-DD hh:mm
	출동소방서

	import pandas as pd
	from datetime import datetime

	df = pd.read_csv('dataset.csv')

	# 각 컬럼에 저장
	df['call_time'] = pd.to_datetime(df['신고일시'])
	df['arrive_time'] = pd.to_datetime(df['출동일시'])
	df['fire_station'] = df['출동소방서']

	# 시간 차이 구하기
	## .dt 속성은 datetime 형식의 열에 대해 날짜와 시간 구성 요소에 접근할 수 있게 해준다.
	df['diff_time'] = (df['arrive_time'] - df['call_time']).dt.total_seconds() # 차이를 초로 계산

	# 그룹화 한 후 내림차순 정렬하기
	# Average the delay per (fire station, call year, call month). The column is
	# selected explicitly because the first positional argument of GroupBy.mean()
	# is `numeric_only`, not a column name.
	group_keys = [df['fire_station'], df['call_time'].dt.year, df['call_time'].dt.month]
	df = df.groupby(group_keys)[['diff_time']].mean()
	df = df.sort_values('diff_time', ascending=False) # largest mean delay first

	res_date = df['diff_time'].head(1) # 첫 번째 행만 선택 (diff_time이 제일 큰 항목)

	print(res_date)

	fire_station call_time call_time
	대화119안전센터 2018 9 3840.0
	초월119안전센터 2018 7 2925.0
	6 2865.0
	경기도소방재난본부 2018 8 2220.0
	10 1620.0
	...
	용문119안전센터 2018 2 45.0
	설악119안전센터 2018 6 45.0
	백암119안전센터 2018 12 40.0
	영북119안전센터 2018 3 0.0
	가평119안전센터 2018 6 0.0
	Name: diff_time, Length: 623, dtype: float64

	print(res_date.iloc[0])

	3840.0

	res_num = float(res_date.iloc[0]) / 60 # 초 단위로 저장된 res_date를 실수형으로 변환 후, 분단위인 60으로 나눔.
	result = round(res_num) # 반올림

	print(result)

	64

	import pandas as pd

	df = pd.read_csv('dataset.csv')

	# df 변수의 구조 파악
	print(df.info())

	<class 'pandas.core.frame.DataFrame'>
	RangeIndex: 6309 entries, 0 to 6308
	Data columns (total 8 columns):
	# Column Non-Null Count Dtype
	--- ------ -------------- -----
	0 school_name 6309 non-null object
	1 student_1 6309 non-null int64
	2 student_2 6309 non-null int64
	3 student_3 6309 non-null int64
	4 student_4 6309 non-null int64
	5 student_5 6309 non-null int64
	6 student_6 6309 non-null int64
	7 teacher 6309 non-null int64
	dtypes: int64(7), object(1)
	memory usage: 394.4+ KB
	None

	# df 변수의 요약 통계량 확인
	print(df.describe())

	student_1 student_2 student_3 student_4 student_5 \
	count 6309.000000 6309.000000 6309.000000 6309.000000 6309.000000
	mean 67.578697 66.378982 66.438897 73.728642 71.712474
	std 67.752782 65.398715 64.544496 70.324875 67.134313
	min 0.000000 0.000000 0.000000 0.000000 0.000000
	25% 10.000000 10.000000 10.000000 11.000000 11.000000
	50% 50.000000 49.000000 51.000000 57.000000 56.000000
	75% 104.000000 101.000000 103.000000 115.000000 112.000000
	max 470.000000 390.000000 401.000000 420.000000 393.000000

	student_6 teacher
	count 6309.000000 6309.000000
	mean 71.538120 23.925503
	std 66.448847 17.191077
	min 0.000000 0.000000
	25% 11.000000 8.000000
	50% 58.000000 20.000000
	75% 112.000000 35.000000
	max 428.000000 95.000000

	# 상위 5개 데이터 확인
	print(df.head())

	school_name student_1 student_2 student_3 student_4 student_5 \
	0 서울교육대학교부설초등학교 95 95 95 104 108
	1 서울대학교사범대학부설초등학교 97 100 102 102 102
	2 서울개일초등학교 183 163 176 186 177
	3 서울구룡초등학교 109 115 115 131 133
	4 서울논현초등학교 34 47 34 46 46

	student_6 teacher
	0 110 31
	1 104 36
	2 193 42
	3 132 38
	4 46 18

	# 교사 1인당 학생 수(학생 수 합계 / 교사 수) 구한 후 내림차순 정렬하기
	df["tch_std"] = (df["student_1"] + df["student_2"] + df["student_3"] + df["student_4"] + df["student_5"] + df["student_6"]) / df["teacher"]

	df_sort = df.sort_values("tch_std", ascending=False)
	df_sort["tch_std"].head() # 내림차순 정렬된 것 확인

	1044 36.500000
	3387 34.125000
	1464 32.333333
	360 32.041667
	687 31.652174
	Name: tch_std, dtype: float64

	# 제일 상단 데이터를 추출하고 변수에 저장
	result = df_sort["teacher"].head(1)
	print(result)

	1044 6
	Name: teacher, dtype: int64

	# Series에 남아 있는 값 1개를 스칼라로 추출하여 변수에 저장
	result = result.iloc[-1]
	print(result)

	6