구글 Colab Jupyter Notebook 사용 팁

Google Drive 연동 및 각종 사용 꿀팁 정리

Posted by 옐란 on 2021-03-23
  • Keras 프레임워크로 2년간 Google Colab과 Google Drive를 이용해 작업하면서 사용하던 내용을 정리해 본다.

목차

1
2
3
4
5
6
7
8
1. Google Drive file download-upload
2. Colab file upload/download
3. Colab tensorflow/keras downgrade
4. Colab file handling
5. Colab utility install
6. Colab github 프로젝트 연동
7. Colab upload 오류 대처
8. Colab에 주식관련 Lib 설치(Ta-Lib)

Google drive file download-upload

file download

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# Download from Google Drive: authenticate this Colab session, then build the
# Drive v3 API client that gcp_download() uses.
from google.colab import auth
auth.authenticate_user()
from googleapiclient.discovery import build
drive_service = build('drive', 'v3')
import io
from io import BytesIO
from googleapiclient.http import MediaIoBaseDownload

# Path prefix that downloaded files are written under.
TEMP_PATH = '/tmp/'
def gcp_download(file_name, key):
    """Download a Google Drive file (e.g. a model) into ``TEMP_PATH``.

    Args:
        file_name: Name to save the file as; it is appended to ``TEMP_PATH``.
        key: Drive file id, i.e. the ``id=`` value of a share link such as
            https://drive.google.com/open?id=1TlvbayGRCjAI6bOZrUYMmv6g6b95rnRM
    """
    request = drive_service.files().get_media(fileId=key)

    # Stream each chunk straight into the destination file so that large
    # files are never held in memory as a whole.
    with open(TEMP_PATH + file_name, 'wb') as f:
        downloader = MediaIoBaseDownload(f, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()
            if status:
                print("Download %d%%." % int(status.progress() * 100))
    print("Download Complete!")

# Usage: only the file name to save as and the Drive file key are needed.
down_file_name = '201912_purcon_gd.csv'
gcp_download(down_file_name, '1q-E4K439JJBV2HnqgbibHMhmNOAHO11v')

# Download a pickle file and load it.
import pickle
gd_tokenizer_name = 'gd_tokenizer_350K4_ALL.pickle'
gcp_download(gd_tokenizer_name, '15hcUEPYB-el1oNctRrhLqObuFKVvXJSV')
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a source you trust.
with open(TEMP_PATH + gd_tokenizer_name, 'rb') as handle:
    gd_tokenizer = pickle.load(handle)

file upload

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Cached PyDrive client; filled in by doGoodleDriveAuth() so that later
# uploads reuse it instead of authenticating again.
gcp_drive = None


def doGoodleDriveAuth():
    """Authenticate the Colab user and return a PyDrive ``GoogleDrive`` client.

    The client is also stored in the module-level ``gcp_drive`` so that
    ``gcp_upload`` can reuse it on subsequent calls.

    Returns:
        The newly created ``GoogleDrive`` client.
    """
    # Without this declaration the assignment below would only create a
    # local variable and the module-level cache would stay None forever.
    global gcp_drive

    # 1. Authenticate and create the PyDrive client.
    auth.authenticate_user()
    gauth = GoogleAuth()
    print('gauth:', gauth)
    gauth.credentials = GoogleCredentials.get_application_default()

    gcp_drive = GoogleDrive(gauth)
    print('gcp_drive:', gcp_drive)
    return gcp_drive


# PyDrive reference:
# https://googledrive.github.io/PyDrive/docs/build/html/index.html
def gcp_upload(file_path, w_file_name):
    """Upload a local file to Google Drive under the title ``w_file_name``.

    Args:
        file_path: Directory prefix of the local file (e.g. ``"./"``); it is
            concatenated directly with ``w_file_name``.
        w_file_name: Local file name, also used as the Drive file title.

    Returns:
        The id of the uploaded Drive file, or ``None`` when anything fails
        (the error is printed rather than raised, so batch uploads continue).
    """
    try:
        # Reuse the cached client when there is one; authenticate otherwise.
        drive = gcp_drive if gcp_drive is not None else doGoodleDriveAuth()

        # To place the file inside a specific folder, add a "parents" entry:
        # {'title': ..., "parents": [{"kind": "drive#fileLink", "id": <folder id>}]}
        uploaded = drive.CreateFile({'title': w_file_name})
        # Only SetContentFile is needed. A preceding SetContentString call
        # would pin the MIME type to text/plain before the real (possibly
        # binary) content is attached.
        uploaded.SetContentFile(file_path + w_file_name)
        uploaded.Upload()
        print('Uploaded file with ID {}'.format(uploaded.get('id')))
        return uploaded.get('id')
    except Exception as e:
        # Deliberately best-effort: report the failure and keep going.
        print('gcp_upload err:', e)
        return None

# 사용법
# 먼저, 파일을 100M단위로 분할 압축(100M 이상일때 한번에 안올라가는 문제있어서)
!zip -s 100M -o TextCNN_EarayStop.zip ./TextCNN_EarayStop.h5
# 루프돌면서 분할 파일 업로드(나중에 다시 합칠때는 다운로드 받고 압축푼다음 합칠수 있음)
try:
for idx in range(9):
if idx == 0: div_name = "TextCNN_EarayStop.zip"
else: div_name = "TextCNN_EarayStop.z0" + str(idx)
gcp_upload("./", div_name) #gcp root경로에 저장됨
except Exception as e : print('e:', e)

# Downloading the split parts again and merging them
model_name_early_stop = 'TextCNN_EarayStop_350K4.h5'
#gcp_download(model_name_4bu, '1T2Es7AO2FTYuMaq2A8-9Tc_g_mUKhitP') # downloads into the temp path
gcp_download('TextCNN_EarayStop_350K4.zip','1RD9VCeLwKmZrZCP8SRRjwA4ADGPzZcZi')
gcp_download('TextCNN_EarayStop_350K4.z01','17RPrQiuPyx5eUATEX5FYMBSxKcao6X5w')
gcp_download('TextCNN_EarayStop_350K4.z02','1NrR7sHeVEcqlJT4bSAspx3RsTvbpiZkH')
gcp_download('TextCNN_EarayStop_350K4.z03','1nI0yli3HLCSRF3RVWTNgQCUQ0thhmviv')
#...
# Merge the split parts and extract the archive.
# The .z* glob matches the numbered parts as well as the final .zip part.
!cat /tmp/TextCNN_EarayStop_350K4.z* > /tmp/TextCNN_EarayStop_Mer.zip
!unzip /tmp/TextCNN_EarayStop_Mer.zip

Colab file upload/download

colab file upload

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# 1. Direct upload (files.upload())
from google.colab import files
# imread must be imported before the call at the bottom of this snippet.
from matplotlib.pyplot import imread

# Show the file-upload dialog.
uploaded = files.upload()
# Print the name and size of every uploaded file.
for fn, content in uploaded.items():
    print('Upload file "{name}" with length {length} bytes'.format(name=fn, length=len(content)))
# Read an uploaded file for use.
im = imread("output.jpg")

# 2. Loading a file after uploading it to Google Drive

# 2-1) Open the file and read/store it line by line.
# Uses pre-trained data (for fine-tuning).
import os

import numpy as np
from google.colab import drive

drive.mount('/content/gdrive')

# Maps each word to its embedding vector (float32 numpy array).
embeddings_index = {}
glove_path = os.path.join('/content/gdrive/My Drive', 'glove.6B.100d.txt')
# The context manager closes the file even if parsing a line fails.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

# 2-2) Reading a file through Gensim's datapath helper
# 2-2-1)
from gensim.test.utils import datapath, get_tmpfile
from google.colab import drive
drive.mount('/content/gdrive')

glove_file = datapath('/content/gdrive/My Drive/glove.6B.100d.txt')
# NOTE(review): np is not imported in this snippet, and a GloVe text file is
# presumably space-separated with a leading word token, so a comma-delimited
# float32 loadtxt may fail here - confirm.
xy = np.loadtxt( glove_file , delimiter=',', dtype=np.float32)

# 2-2-2)
from gensim.test.utils import datapath, get_tmpfile
feature_data = datapath('/content/gdrive/My Drive/AI/kaggle/' + '300features_40minwords_10text')

# 2-3) Reading directly from the mounted Google Drive path
# 2-3-1) image
from google.colab import drive
drive.mount('/content/gdrive')
from matplotlib.pyplot import imread

# Files in Drive are reachable under the mount point once it is mounted.
im = imread("/content/gdrive/My Drive/NLP-Lab/output.jpg")

# 2-3-2) Reading a text file
import numpy as np
from google.colab import drive
drive.mount('/content/gdrive')

# Read CSV data.
xy = np.loadtxt('/content/gdrive/My Drive/data-01-test-score.csv', delimiter=',', dtype=np.float32)
# For string data use dtype=str (the np.str alias no longer exists in
# current NumPy releases).
# Google Drive root path: /content/gdrive/My Drive/
x_data = xy[:, 0:-1]   # every column except the last
y_data = xy[:, [-1]]   # last column, kept 2-D
# Note: this can fail with a chunk_size-exceeded error.

colab file download

1
2
3
4
5
6
7
8
9
10
11
12
13
14
# A. Downloading pandas data
import pandas as pd
# err_vec is not defined in this snippet; presumably it comes from an
# earlier notebook cell - confirm.
pd.DataFrame(err_vec).to_csv("/tmp/preporcessing_case6.csv")

# Send the CSV written above to the browser as a download.
from google.colab import files
files.download("/tmp/preporcessing_case6.csv")

# B. Downloading NumPy data
import numpy
a = numpy.asarray([ [1,2,3], [4,5,6], [7,8,9] ])
numpy.savetxt("foo.csv", a, delimiter=",")

from google.colab import files
# Download the file that was just written (the original passed a
# "file name" placeholder instead of a real path).
files.download("foo.csv")

Colab tensorflow/keras downgrade

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Using TensorFlow 1.x, method 1
# NOTE(review): newer Colab runtimes may no longer support this magic - confirm.
%tensorflow_version 1.x
import tensorflow as tf
# Check the version
print(tf.__version__)

# Using TensorFlow 1.x, method 2
!pip install tensorflow==1.14.0
# Installing a specific version?: pip3 install tensorflow-gpu==1.15.2

# keras version downgrade
!pip uninstall keras
!pip install keras==2.1.2
import kears
print(__keras.version__)

Colab file handling

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# List a directory
! ls -alrt /content/
# Move a file
!mv /tmp/facenet_keras.h5 /content
# Delete files
!\rm -rf /content/origin_data
# Create a directory
!mkdir /content/origin_data
# Install the compression tools
!apt-get install zip unzip
# Compress into split parts of at most 100M each
!zip -s 100M -o TextCNN_GCN_10bu.zip ./TextCNN_GCN_10bu.h5
# -r: include all sub-paths; the zip is created in the current directory (target given as an absolute path)
!zip -r dataset_zip.zip /content/dataset
# Extract; -O cp949 prevents garbled Korean file names
!unzip -O cp949 /tmp/00_Total.zip
!rm -rf /tmp/00_Total.zip

Colab utility install

1
2
# Replace the current google-api-python-client with the pinned 1.7.3 release.
!pip uninstall google-api-python-client -y
!pip install google-api-python-client==1.7.3

Colab github 프로젝트 연동

1
2
3
4
5
6
7
8
9
10
11
12
!git clone https://github.com/philipperemy/keras-attention-mechanism.git

!ls -alrt

!cd keras-attention-mechanism

import numpy as np
from attention_utils import get_activations, get_data

np.random.seed(1337) # for reproducibility
from keras.models import *
from keras.layers import Input, Dense, merge

Colab upload 오류 대처

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Error: RedirectMissingLocation: Redirected but the response is missing a Location: header.

# Check versions
#!pip list
#httplib2 0.17.1
#google-api-python-client 1.7.12

# Fix: downgrade (https://github.com/tokland/youtube-upload/issues/293)
# !pip3 install google-api-python-client==1.7.3 oauth2client==4.1.2 progressbar2==3.38.0 httplib2==0.15.0
# Remove the current versions first, then install the pinned ones.
!pip uninstall google-api-python-client -y
!pip uninstall oauth2client -y
!pip uninstall progressbar2 -y
!pip uninstall httplib2 -y
!pip install google-api-python-client==1.7.3
!pip install oauth2client==4.1.2
!pip install progressbar2==3.38.0
!pip install httplib2==0.15.0

Colab 주식관련 Lib 설치(Ta-Lib)

1
2
3
4
5
6
7
# (2020) https://stackoverflow.com/questions/49648391/how-to-install-ta-lib-in-google-colab
# Fetch the two TA-Lib .deb packages, install them with dpkg, then install
# the Python wrapper with pip.
url = 'https://launchpad.net/~mario-mariomedina/+archive/ubuntu/talib/+files'
# $url is filled in from the Python variable defined above.
!wget $url/libta-lib0_0.4.0-oneiric1_amd64.deb -qO libta.deb
!wget $url/ta-lib0-dev_0.4.0-oneiric1_amd64.deb -qO ta.deb
!dpkg -i libta.deb ta.deb
!pip install ta-lib
# Importing verifies that the installation succeeded.
import talib