Shortlink: goo.gl/wSuuS9
The GitHub repository will soon be available at github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wikisum
'system': | |
[ | |
{ | |
'type': 'text', | |
'text': "You are Claude Code, Anthropic's official CLI for Claude.", | |
'cache_control': {'type': 'ephemeral'} | |
}, | |
{ | |
'type': 'text', | |
'text': 'You are an interactive CLI tool that helps users with software engineering tasks. |
# train_grpo.py | |
# | |
# See https://github.com/willccbb/verifiers for ongoing developments | |
# | |
import re | |
import torch | |
from datasets import load_dataset, Dataset | |
from transformers import AutoTokenizer, AutoModelForCausalLM | |
from peft import LoraConfig | |
from trl import GRPOConfig, GRPOTrainer |
import torch | |
import torch.nn as nn | |
import torch.nn.functional as F | |
# helpers | |
def make_unit_length(x, epsilon=1e-6):
    """Scale ``x`` to (approximately) unit L2 length along its last dimension.

    Args:
        x: Tensor to normalize; the norm is taken over ``dim=-1``.
        epsilon: Small constant added to the norm before dividing, so an
            all-zero vector yields zeros instead of NaN.

    Returns:
        A tensor shaped like ``x`` in which every last-dimension vector has
        been divided by ``(its L2 norm + epsilon)``.
    """
    # keepdim=True keeps the reduced axis so the division broadcasts over x.
    norm = x.norm(p=2, dim=-1, keepdim=True)
    return x.div(norm + epsilon)
Shortlink: goo.gl/wSuuS9
The GitHub repository will soon be available at github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wikisum
import random | |
class TicTacToe: | |
def __init__(self, playerX, playerO): | |
self.board = [' ']*9 | |
self.playerX, self.playerO = playerX, playerO | |
self.playerX_turn = random.choice([True, False]) | |
def play_game(self): |
""" Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. """ | |
import numpy as np | |
import cPickle as pickle | |
import gym | |
# hyperparameters
H = 200  # number of hidden layer neurons
batch_size = 10  # number of episodes between parameter updates
learning_rate = 1e-4
gamma = 0.99  # discount factor for reward
#!/usr/bin/env python | |
# coding: utf-8 | |
"""Sampling Sequence Data from model""" | |
import numpy as np | |
import tensorflow as tf | |
import json | |
import cPickle as pickle | |
import itertools as it | |
from rnnlib import PTBModel |