Monte Carlo Method 몬테카를로 학습 구현

#Monte Carlo Method 몬테카를로 학습 구현
#SAINT Lab. Q1 [강화학습]
#60201969 이유현 [2024.01.23]
import random #랜덤 에이전트 구현 목적 라이브러리

#환경
class GridWorld():
    """Square grid environment whose episode ends at the far corner.

    The agent position is stored as ``(x, y)``: the up/down moves change
    ``x`` and the left/right moves change ``y``. Every step yields a reward
    of -1, and a move that would leave the grid keeps the agent on the
    border instead.

    Args:
        size: Number of cells per side. The default of 4 reproduces the
            original fixed grid whose terminal cell is (3, 3).

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """

    def __init__(self, size=4):
        if size < 1:
            raise ValueError("size must be at least 1")
        self.size = size
        self.x = 0
        self.y = 0

    def step(self, a):
        """Apply action ``a`` and return ``((x, y), reward, done)``.

        Actions: 0 = right, 1 = left, 2 = up, 3 = down. Any other value
        leaves the position unchanged but still yields the -1 reward.
        """
        if a == 0:
            self.move_right()
        elif a == 1:
            self.move_left()
        elif a == 2:
            self.move_up()
        elif a == 3:
            self.move_down()

        reward = -1
        done = self.is_done()
        return (self.x, self.y), reward, done

    def move_right(self):
        """Increase ``y`` by one, capped at ``size - 1``."""
        self.y = min(self.y + 1, self.size - 1)

    def move_left(self):
        """Decrease ``y`` by one, never going below 0."""
        self.y = max(self.y - 1, 0)

    def move_up(self):
        """Decrease ``x`` by one, never going below 0."""
        self.x = max(self.x - 1, 0)

    def move_down(self):
        """Increase ``x`` by one, capped at ``size - 1``."""
        self.x = min(self.x + 1, self.size - 1)

    def is_done(self):
        """Return True when the agent stands on the terminal cell."""
        goal = self.size - 1
        return self.x == goal and self.y == goal

    def get_state(self):
        """Return the current position as an ``(x, y)`` tuple."""
        return (self.x, self.y)

    def reset(self):
        """Move the agent back to (0, 0) and return that position."""
        self.x = 0
        self.y = 0
        return (self.x, self.y)

#에이전트    
class Agent():
    """Agent that ignores the state and picks one of four actions at random."""

    def __init__(self):
        """The agent keeps no state."""

    def select_action(self):
        """Return an action index 0-3 from a single ``random.random()`` draw.

        Each quarter of the unit interval maps to one action:
        [0, 0.25) -> 0, [0.25, 0.5) -> 1, [0.5, 0.75) -> 2, otherwise 3.
        """
        draw = random.random()
        for action, upper_bound in enumerate((0.25, 0.5, 0.75)):
            if draw < upper_bound:
                return action
        return 3

#본격적으로 학습을 하는 메인 함수
def main(num_episodes=50000, gamma=1.0, alpha=0.0001):
    """Estimate state values under the random policy with Monte Carlo updates.

    Runs ``num_episodes`` episodes with a ``GridWorld`` environment and a
    random ``Agent``. After each episode the recorded states are walked in
    reverse, the return is accumulated, and each state's table entry is
    moved towards that return by a step of ``alpha``. The final table is
    printed one row per line.

    Args:
        num_episodes: Number of episodes to simulate (default 50000).
        gamma: Discount factor applied to the accumulated return
            (default 1.0).
        alpha: Step size of the incremental update (default 0.0001).

    Returns:
        The 4x4 value table as a list of lists, indexed ``[x][y]``.
    """
    env = GridWorld()
    agent = Agent()
    values = [[0] * 4 for _ in range(4)]  # one entry per grid cell

    for _ in range(num_episodes):
        # Let the random agent play one full episode and record it.
        done = False
        history = []
        while not done:
            action = agent.select_action()
            (x, y), reward, done = env.step(action)
            # The stored state is the one reached AFTER the step, so the
            # start cell only appears in the history when it is revisited.
            history.append((x, y, reward))
        env.reset()

        # Walk the episode backwards. A state is updated with the return
        # collected after it was reached, i.e. before its own incoming
        # reward is added to the running sum.
        cum_reward = 0
        for x, y, reward in reversed(history):
            values[x][y] += alpha * (cum_reward - values[x][y])
            cum_reward = reward + gamma * cum_reward

    # Show the learned table once training has finished.
    for row in values:
        print(row)

    return values
        
if __name__ == "__main__":
    # Run the training only when executed as a script, not on import.
    main()

PS C:\\Users\\yuhyu\\Desktop\\CODE>  & 'C:\\Users\\yuhyu\\AppData\\Local\\Programs\\Python\\Python311\\python.exe' 'c:\\Users\\yuhyu\\.vscode\\extensions\\ms-python.python-2023.22.1\\pythonFiles\\lib\\python\\debugpy\\adapter/../..\\debugpy\\launcher' '55366' '--' 'C:\\Users\\yuhyu\\Desktop\\CODE\\Monte_Carlo_Method.py'
[-58.10720180108525, -56.12318226310994, -52.0520116993405, -50.27044865210934]
[-56.418550032727, -53.406277400359116, -48.33834916636482, -44.300056140512275]
[-53.8443301974015, -49.32410092480804, -41.06965096350658, -29.4641804456401]  
[-51.48295145534912, -45.49482559809898, -30.66538245627557, 0.0]

Temporal Difference 시간적 차이 학습 구현

#Temporal Difference 시간적 차이 학습 구현
#SAINT Lab. Q1 [강화학습]
#60201969 이유현 [2024.01.23]
import random #랜덤 에이전트 구현 목적 라이브러리

#환경
class GridWorld():
    """Square grid environment with a terminal cell in the far corner.

    The position is kept in ``self.x`` and ``self.y``; up/down moves change
    ``x`` and left/right moves change ``y``. Each call to ``step`` yields a
    reward of -1. Moves that would cross the border leave the agent on the
    border.

    Args:
        size: Number of cells per side. The default of 4 keeps the original
            behaviour, where the episode ends at (3, 3).

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """

    def __init__(self, size=4):
        if size < 1:
            raise ValueError("size must be at least 1")
        self.size = size
        self.x = 0
        self.y = 0

    def step(self, a):
        """Perform action ``a`` and return ``((x, y), reward, done)``.

        Action codes: 0 = right, 1 = left, 2 = up, 3 = down. An unknown
        code does not move the agent, but the -1 reward is still returned.
        """
        if a == 0:
            self.move_right()
        elif a == 1:
            self.move_left()
        elif a == 2:
            self.move_up()
        elif a == 3:
            self.move_down()

        reward = -1
        done = self.is_done()
        return (self.x, self.y), reward, done

    def move_right(self):
        """Increase ``y`` by one, capped at ``size - 1``."""
        self.y = min(self.y + 1, self.size - 1)

    def move_left(self):
        """Decrease ``y`` by one, never going below 0."""
        self.y = max(self.y - 1, 0)

    def move_up(self):
        """Decrease ``x`` by one, never going below 0."""
        self.x = max(self.x - 1, 0)

    def move_down(self):
        """Increase ``x`` by one, capped at ``size - 1``."""
        self.x = min(self.x + 1, self.size - 1)

    def is_done(self):
        """Return True when the agent is on the terminal cell."""
        last = self.size - 1
        return self.x == last and self.y == last

    def get_state(self):
        """Return the current position as an ``(x, y)`` tuple."""
        return (self.x, self.y)

    def reset(self):
        """Put the agent back at (0, 0) and return that position."""
        self.x = 0
        self.y = 0
        return (self.x, self.y)

#에이전트    
class Agent():
    """Random policy: every call picks one of four action indices."""

    def __init__(self):
        """Nothing to initialise; the agent is stateless."""
        return None

    def select_action(self):
        """Draw once from ``random.random()`` and map it to 0, 1, 2 or 3.

        The unit interval is cut at 0.25, 0.5 and 0.75; the checks below run
        from the top interval downwards.
        """
        coin = random.random()
        if coin >= 0.75:
            return 3
        if coin >= 0.5:
            return 2
        if coin >= 0.25:
            return 1
        return 0

#본격적으로 학습을 하는 메인 함수
def main(num_episodes=50000, gamma=1.0, alpha=0.01):
    """Estimate state values under the random policy with one-step TD updates.

    Runs ``num_episodes`` episodes with a ``GridWorld`` environment and a
    random ``Agent``. After every single step the value of the state that
    was just left is moved towards ``reward + gamma * value(next state)``.
    The final table is printed one row per line.

    Args:
        num_episodes: Number of episodes to simulate (default 50000).
        gamma: Discount factor applied to the next state's value
            (default 1.0).
        alpha: Step size of the update (default 0.01). This is larger than
            the 0.0001 used by the Monte Carlo script; the original author
            notes that TD learning fluctuates less than MC, which allows
            bigger updates.

    Returns:
        The 4x4 value table as a list of lists, indexed ``[x][y]``.
    """
    env = GridWorld()
    agent = Agent()
    values = [[0] * 4 for _ in range(4)]  # one entry per grid cell

    for _ in range(num_episodes):
        done = False
        while not done:
            x, y = env.get_state()
            action = agent.select_action()
            # step() already returns the new position, so there is no need
            # to query the environment a second time.
            (x_prime, y_prime), reward, done = env.step(action)

            # Update the table immediately after each step.
            td_target = reward + gamma * values[x_prime][y_prime]
            values[x][y] += alpha * (td_target - values[x][y])
        env.reset()

    # Show the learned table once training has finished.
    for row in values:
        print(row)

    return values
        
if __name__ == "__main__":
    # Run the training only when executed as a script, not on import.
    main()

PS C:\\Users\\yuhyu\\Desktop\\CODE> & C:/Users/yuhyu/AppData/Local/Programs/Python/Python311/python.exe c:/Users/yuhyu/Desktop/CODE/Temporal_Difference.py
[-59.600843535150226, -57.61812381534683, -54.225961046262995, -51.85792867871927]
[-57.86248000315834, -55.1537759404265, -50.40931469255334, -45.62909009382938]   
[-54.562002551037054, -49.95594540191525, -41.46912085898049, -31.099402937538297]
[-51.488690434234314, -45.21546182004205, -29.788365212148257, 0]

Monte_Carlo_Method.py

Temporal_Difference.py