import logging
import sys

import pfrl
import torch
from textrl import TextRLEnv, TextRLActor
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoModelForTokenClassification,
    AutoModelWithLMHead,
    AutoTokenizer,
)
# Log to stdout with a bare format so pfrl's training progress lines are visible.
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

Using a pre-trained model, we can generate tweets in Elon Musk's style.

# Load the pre-trained GPT-2 tweet model and move it to the GPU for scoring.
# AutoModelWithLMHead is deprecated (see the captured FutureWarning below);
# AutoModelForCausalLM is the supported replacement for causal language models
# and returns the same GPT2LMHeadModel for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained("huggingtweets/elonmusk")
model = AutoModelForCausalLM.from_pretrained("huggingtweets/elonmusk")
model.eval()
model.cuda()
/usr/local/lib/python3.7/dist-packages/transformers/models/auto/modeling_auto.py:902: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.
  FutureWarning,
GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (2): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (3): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (4): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (5): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (6): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (7): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (8): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (9): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (10): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (11): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

a sentiment classifier for rl reward

# Sentiment classifier used as the RL reward signal. In the output below,
# LABEL_0 scores highest for the clearly negative example, so its score is
# what the reward function reads.
sentiment = pipeline(
    'sentiment-analysis',
    model="cardiffnlp/twitter-roberta-base-sentiment",
    tokenizer="cardiffnlp/twitter-roberta-base-sentiment",
    device=0,
    return_all_scores=True,
)
# Silence transformers' own logging so it does not flood the training output.
logging.getLogger('transformers').setLevel(logging.CRITICAL)
sentiment("dogecoin is bad")
[[{'label': 'LABEL_0', 'score': 0.9338533878326416},
  {'label': 'LABEL_1', 'score': 0.060118917375802994},
  {'label': 'LABEL_2', 'score': 0.0060277231968939304}]]
# Score of the first label (LABEL_0) — the value the reward function adds.
sentiment("dogecoin is bad")[0][0]['score']
0.9338533878326416

Set our text generation reward: inverse perplexity + sentiment classifier.

  • Inverse perplexity makes sure the generated sentence's probability stays high.
  • The sentiment classifier pushes the generated text to be more negative.
class MyRLEnv(TextRLEnv):
    """TextRL environment whose reward is inverse perplexity plus the
    negative-sentiment score of the generated text."""

    def get_reward(self, input_text, predicted_list, finish):
        """Score a generation.

        predicted_list is the list of predicted tokens. Reward is only
        granted once generation finishes and the length is sensible
        (more than one token, fewer than 50).
        """
        reward = 0
        if finish and 1 < len(predicted_list) < 50:
            predicted_text = tokenizer.convert_tokens_to_string(predicted_list)
            # Scoring needs no gradients; no_grad avoids building the
            # autograd graph on every reward call (memory + speed).
            with torch.no_grad():
                # Inverse perplexity: 1/exp(LM loss). Higher when the model
                # finds the continuation likely, encouraging fluent text.
                inputs = tokenizer(input_text + predicted_text, return_tensors='pt').to('cuda')
                loss = model(**inputs, labels=inputs["input_ids"]).loss
                # loss is a scalar, so .item() directly (no .mean() needed).
                reward += 1 / torch.exp(loss).item()
            # Score of the first sentiment label (the negative class for
            # this checkpoint, per the example output above) rewards
            # negative generations.
            reward += sentiment(predicted_text)[0][0]['score']
        return reward

Fit one example.

# Build the environment, actor, and PPO agent around a single prompt.
# (Fixes the misspelled local name `observaton_list`; it was only used on
# the following line.)
observation_list = ['i think dogecoin is']
env = MyRLEnv(model, tokenizer, observation_input=observation_list)
actor = TextRLActor(env, model, tokenizer)
agent = actor.agent_ppo(update_interval=10, minibatch_size=2000, epochs=20)
# Baseline sample before any RL training.
actor.predict('i think dogecoin is')
' a great idea.'
# Train with PPO for 500 steps, evaluating one episode every 10 steps; the
# best checkpoint (by evaluation reward) is saved under outdir/best.
pfrl.experiments.train_agent_with_evaluation(
    agent,
    env,
    steps=500,
    eval_n_steps=None,       # evaluate by full episodes, not step counts
    eval_n_episodes=1,       
    train_max_episode_len=50,  # matches the <50-token cap in get_reward
    eval_interval=10,
    outdir='elon_musk_dogecoin', 
)
outdir:elon_musk_dogecoin step:28 episode:0 R:0.11256813772978423
statistics:[('average_value', -1.1820644), ('average_entropy', 71312.86), ('average_value_loss', 1.5397733084180139), ('average_policy_loss', -0.0005778993137496537), ('n_updates', 33), ('explained_variance', -36.946850889108426)]
evaluation episode 0 length:15 R:0.6118701571341162
The best score is updated -3.4028235e+38 -> 0.6118701571341162
Saved the agent to elon_musk_dogecoin/best
outdir:elon_musk_dogecoin step:33 episode:1 R:0.04733136688125236
statistics:[('average_value', -1.1896302), ('average_entropy', 71312.86), ('average_value_loss', 1.5780798437840797), ('average_policy_loss', -0.0006748329104833401), ('n_updates', 34), ('explained_variance', -0.7300913744865904)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:38 episode:2 R:0.051060883339943464
statistics:[('average_value', -1.1958332), ('average_entropy', 71312.86), ('average_value_loss', 1.5780798437840797), ('average_policy_loss', -0.0006748329104833401), ('n_updates', 34), ('explained_variance', -0.7300913744865904)]
outdir:elon_musk_dogecoin step:87 episode:3 R:0.1912596257209756
statistics:[('average_value', -1.2554632), ('average_entropy', 71312.94), ('average_value_loss', 1.5440955738990734), ('average_policy_loss', -0.0002805701943333798), ('n_updates', 39), ('explained_variance', -390.4063933607259)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:103 episode:4 R:0.5811674822406133
statistics:[('average_value', -1.2457539), ('average_entropy', 71312.98), ('average_value_loss', 1.5156753742840232), ('average_policy_loss', -0.0002032094211821592), ('n_updates', 41), ('explained_variance', -51.0615278646253)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:112 episode:5 R:0.06606750377697743
statistics:[('average_value', -1.2375408), ('average_entropy', 71312.99), ('average_value_loss', 1.5108967859830176), ('average_policy_loss', -8.875665203266267e-05), ('n_updates', 42), ('explained_variance', -1.4907311649594996)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:117 episode:6 R:0.04733136688125236
statistics:[('average_value', -1.2344139), ('average_entropy', 71313.01), ('average_value_loss', 1.5108967859830176), ('average_policy_loss', -8.875665203266267e-05), ('n_updates', 42), ('explained_variance', -1.4907311649594996)]
outdir:elon_musk_dogecoin step:133 episode:7 R:0.055373401821994093
statistics:[('average_value', -1.227982), ('average_entropy', 71313.06), ('average_value_loss', 1.493516572158445), ('average_policy_loss', -8.711994806534759e-05), ('n_updates', 44), ('explained_variance', -78.9884378834259)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:148 episode:8 R:0.13826534376486688
statistics:[('average_value', -1.2197058), ('average_entropy', 71313.1), ('average_value_loss', 1.4825191640191608), ('average_policy_loss', 0.00019809898391637642), ('n_updates', 45), ('explained_variance', -33.81824737099633)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:198 episode:9 R:0
statistics:[('average_value', -1.1569879), ('average_entropy', 71313.22), ('average_value_loss', 1.3934581258893013), ('average_policy_loss', 0.00033982125845795965), ('n_updates', 50), ('explained_variance', -431.32679512670745)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:213 episode:10 R:0.06352318183982744
statistics:[('average_value', -1.136288), ('average_entropy', 71313.25), ('average_value_loss', 1.3678271796267767), ('average_policy_loss', 0.00024142916698530407), ('n_updates', 52), ('explained_variance', -32.041422884933674)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:216 episode:11 R:0.026587595267637088
statistics:[('average_value', -1.1333122), ('average_entropy', 71313.25), ('average_value_loss', 1.3678271796267767), ('average_policy_loss', 0.00024142916698530407), ('n_updates', 52), ('explained_variance', -32.041422884933674)]
outdir:elon_musk_dogecoin step:221 episode:12 R:0.04169077246803829
statistics:[('average_value', -1.1245402), ('average_entropy', 71313.266), ('average_value_loss', 1.3501226932934995), ('average_policy_loss', 0.0001948871455547391), ('n_updates', 53), ('explained_variance', -370.5535500115619)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:229 episode:13 R:0.04640172880556339
statistics:[('average_value', -1.1159837), ('average_entropy', 71313.27), ('average_value_loss', 1.3501226932934995), ('average_policy_loss', 0.0001948871455547391), ('n_updates', 53), ('explained_variance', -370.5535500115619)]
outdir:elon_musk_dogecoin step:234 episode:14 R:0.051060883339943464
statistics:[('average_value', -1.1078151), ('average_entropy', 71313.29), ('average_value_loss', 1.3338033885315612), ('average_policy_loss', 0.00016046616851613012), ('n_updates', 54), ('explained_variance', -155.0251960693383)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:239 episode:15 R:0.04733136688125236
statistics:[('average_value', -1.0999826), ('average_entropy', 71313.29), ('average_value_loss', 1.3338033885315612), ('average_policy_loss', 0.00016046616851613012), ('n_updates', 54), ('explained_variance', -155.0251960693383)]
outdir:elon_musk_dogecoin step:247 episode:16 R:0.04414926498259348
statistics:[('average_value', -1.0872896), ('average_entropy', 71313.305), ('average_value_loss', 1.3130737553943288), ('average_policy_loss', 0.00023882706174679861), ('n_updates', 55), ('explained_variance', -133.39454800252926)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:250 episode:17 R:0.026587595267637088
statistics:[('average_value', -1.0839288), ('average_entropy', 71313.31), ('average_value_loss', 1.3130737553943288), ('average_policy_loss', 0.00023882706174679861), ('n_updates', 55), ('explained_variance', -133.39454800252926)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:255 episode:18 R:0.051060883339943464
statistics:[('average_value', -1.075243), ('average_entropy', 71313.32), ('average_value_loss', 1.2955336006624358), ('average_policy_loss', 0.0002585244264342431), ('n_updates', 56), ('explained_variance', -19.625075793552828)]
outdir:elon_musk_dogecoin step:260 episode:19 R:0.03691395205937924
statistics:[('average_value', -1.0684801), ('average_entropy', 71313.336), ('average_value_loss', 1.2955336006624358), ('average_policy_loss', 0.0002585244264342431), ('n_updates', 56), ('explained_variance', -19.625075793552828)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:265 episode:20 R:0.04733136688125236
statistics:[('average_value', -1.0598416), ('average_entropy', 71313.336), ('average_value_loss', 1.2769644127080315), ('average_policy_loss', 0.00020445701330461082), ('n_updates', 57), ('explained_variance', -11.145038702421743)]
outdir:elon_musk_dogecoin step:270 episode:21 R:0.043883707944988505
statistics:[('average_value', -1.050933), ('average_entropy', 71313.35), ('average_value_loss', 1.2769644127080315), ('average_policy_loss', 0.00020445701330461082), ('n_updates', 57), ('explained_variance', -11.145038702421743)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:275 episode:22 R:0.051060883339943464
statistics:[('average_value', -1.0417138), ('average_entropy', 71313.35), ('average_value_loss', 1.2577419260452534), ('average_policy_loss', 0.0003183023898851263), ('n_updates', 58), ('explained_variance', -7.749529898995231)]
outdir:elon_musk_dogecoin step:282 episode:23 R:0.9078032784337866
statistics:[('average_value', -1.033803), ('average_entropy', 71313.37), ('average_value_loss', 1.2460979980937505), ('average_policy_loss', 0.00021025619966074651), ('n_updates', 59), ('explained_variance', -65.13956917530275)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:294 episode:24 R:0.06501227370126977
statistics:[('average_value', -1.0146691), ('average_entropy', 71313.39), ('average_value_loss', 1.233799797296524), ('average_policy_loss', 0.00013127381516824243), ('n_updates', 60), ('explained_variance', -5.447836198833656)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:299 episode:25 R:0.03691395205937924
statistics:[('average_value', -1.0066292), ('average_entropy', 71313.4), ('average_value_loss', 1.233799797296524), ('average_policy_loss', 0.00013127381516824243), ('n_updates', 60), ('explained_variance', -5.447836198833656)]
outdir:elon_musk_dogecoin step:307 episode:26 R:0.030314867869480518
statistics:[('average_value', -0.99085534), ('average_entropy', 71313.414), ('average_value_loss', 1.2154734047954199), ('average_policy_loss', 0.0001340141230893558), ('n_updates', 61), ('explained_variance', -10.193662811242667)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:313 episode:27 R:0.04454559288270535
statistics:[('average_value', -0.97872937), ('average_entropy', 71313.42), ('average_value_loss', 1.2004664507844756), ('average_policy_loss', -3.1615591362952405e-06), ('n_updates', 62), ('explained_variance', -33.175360647756264)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:318 episode:28 R:0.903268185582095
statistics:[('average_value', -0.97099626), ('average_entropy', 71313.43), ('average_value_loss', 1.2004664507844756), ('average_policy_loss', -3.1615591362952405e-06), ('n_updates', 62), ('explained_variance', -33.175360647756264)]
outdir:elon_musk_dogecoin step:324 episode:29 R:0.039902052647162366
statistics:[('average_value', -0.95829123), ('average_entropy', 71313.44), ('average_value_loss', 1.1893363548886209), ('average_policy_loss', -0.00011990143655797482), ('n_updates', 63), ('explained_variance', -1.5340383131866893)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:327 episode:30 R:0.027468392345032608
statistics:[('average_value', -0.9532474), ('average_entropy', 71313.445), ('average_value_loss', 1.1893363548886209), ('average_policy_loss', -0.00011990143655797482), ('n_updates', 63), ('explained_variance', -1.5340383131866893)]
outdir:elon_musk_dogecoin step:377 episode:31 R:0
statistics:[('average_value', -0.85051185), ('average_entropy', 71313.5), ('average_value_loss', 1.242111551520579), ('average_policy_loss', -0.0001777062998906545), ('n_updates', 68), ('explained_variance', -888.0687453298063)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:382 episode:32 R:0.03917782313665549
statistics:[('average_value', -0.8399318), ('average_entropy', 71313.51), ('average_value_loss', 1.235224168477715), ('average_policy_loss', -0.0003291908744385169), ('n_updates', 69), ('explained_variance', -0.3201928154205531)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:398 episode:33 R:0.042070292485708205
statistics:[('average_value', -0.8181706), ('average_entropy', 71313.52), ('average_value_loss', 1.2843311149094785), ('average_policy_loss', -0.0002839365799445659), ('n_updates', 70), ('explained_variance', -0.2696518433590207)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:401 episode:34 R:0.026587595267637088
statistics:[('average_value', -0.8137186), ('average_entropy', 71313.516), ('average_value_loss', 1.2754165699154558), ('average_policy_loss', -0.00030307473145871306), ('n_updates', 71), ('explained_variance', -6376.306062245339)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:435 episode:35 R:0.40446967288710806
statistics:[('average_value', -0.77308524), ('average_entropy', 71313.54), ('average_value_loss', 1.2388115018807553), ('average_policy_loss', -0.0004507025736824584), ('n_updates', 74), ('explained_variance', -51.03049916475539)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:453 episode:36 R:0.08308987630781423
statistics:[('average_value', -0.7523759), ('average_entropy', 71313.54), ('average_value_loss', 1.2134499484182972), ('average_policy_loss', -0.000460378126406699), ('n_updates', 76), ('explained_variance', -86.34497778856294)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:458 episode:37 R:0.043883707944988505
statistics:[('average_value', -0.74646515), ('average_entropy', 71313.54), ('average_value_loss', 1.2134499484182972), ('average_policy_loss', -0.000460378126406699), ('n_updates', 76), ('explained_variance', -86.34497778856294)]
outdir:elon_musk_dogecoin step:463 episode:38 R:0.051060883339943464
statistics:[('average_value', -0.74073523), ('average_entropy', 71313.54), ('average_value_loss', 1.1993875744474398), ('average_policy_loss', -0.00037824241489825116), ('n_updates', 77), ('explained_variance', -22.64250657127632)]
evaluation episode 0 length:5 R:0.051060883339943464
outdir:elon_musk_dogecoin step:469 episode:39 R:0.03801414681234764
statistics:[('average_value', -0.73418707), ('average_entropy', 71313.54), ('average_value_loss', 1.1993875744474398), ('average_policy_loss', -0.00037824241489825116), ('n_updates', 77), ('explained_variance', -22.64250657127632)]
outdir:elon_musk_dogecoin step:500 episode:40 R:0
statistics:[('average_value', -0.7161389), ('average_entropy', 71313.53), ('average_value_loss', 1.1581365015823395), ('average_policy_loss', -0.0007426668620610144), ('n_updates', 80), ('explained_variance', -628.3890231942744)]
evaluation episode 0 length:5 R:0.051060883339943464
Saved the agent to elon_musk_dogecoin/500_finish
(<pfrl.agents.ppo.PPO at 0x7f1438408f10>,
 [{'average_entropy': 71312.86,
   'average_policy_loss': -0.0005778993137496537,
   'average_value': -1.1820644,
   'average_value_loss': 1.5397733084180139,
   'eval_score': 0.6118701571341162,
   'explained_variance': -36.946850889108426,
   'n_updates': 33},
  {'average_entropy': 71312.86,
   'average_policy_loss': -0.0006748329104833401,
   'average_value': -1.1896302,
   'average_value_loss': 1.5780798437840797,
   'eval_score': 0.051060883339943464,
   'explained_variance': -0.7300913744865904,
   'n_updates': 34},
  {'average_entropy': 71312.94,
   'average_policy_loss': -0.0002805701943333798,
   'average_value': -1.2554632,
   'average_value_loss': 1.5440955738990734,
   'eval_score': 0.051060883339943464,
   'explained_variance': -390.4063933607259,
   'n_updates': 39},
  {'average_entropy': 71312.98,
   'average_policy_loss': -0.0002032094211821592,
   'average_value': -1.2457539,
   'average_value_loss': 1.5156753742840232,
   'eval_score': 0.051060883339943464,
   'explained_variance': -51.0615278646253,
   'n_updates': 41},
  {'average_entropy': 71312.99,
   'average_policy_loss': -8.875665203266267e-05,
   'average_value': -1.2375408,
   'average_value_loss': 1.5108967859830176,
   'eval_score': 0.051060883339943464,
   'explained_variance': -1.4907311649594996,
   'n_updates': 42},
  {'average_entropy': 71313.06,
   'average_policy_loss': -8.711994806534759e-05,
   'average_value': -1.227982,
   'average_value_loss': 1.493516572158445,
   'eval_score': 0.051060883339943464,
   'explained_variance': -78.9884378834259,
   'n_updates': 44},
  {'average_entropy': 71313.1,
   'average_policy_loss': 0.00019809898391637642,
   'average_value': -1.2197058,
   'average_value_loss': 1.4825191640191608,
   'eval_score': 0.051060883339943464,
   'explained_variance': -33.81824737099633,
   'n_updates': 45},
  {'average_entropy': 71313.22,
   'average_policy_loss': 0.00033982125845795965,
   'average_value': -1.1569879,
   'average_value_loss': 1.3934581258893013,
   'eval_score': 0.051060883339943464,
   'explained_variance': -431.32679512670745,
   'n_updates': 50},
  {'average_entropy': 71313.25,
   'average_policy_loss': 0.00024142916698530407,
   'average_value': -1.136288,
   'average_value_loss': 1.3678271796267767,
   'eval_score': 0.051060883339943464,
   'explained_variance': -32.041422884933674,
   'n_updates': 52},
  {'average_entropy': 71313.266,
   'average_policy_loss': 0.0001948871455547391,
   'average_value': -1.1245402,
   'average_value_loss': 1.3501226932934995,
   'eval_score': 0.051060883339943464,
   'explained_variance': -370.5535500115619,
   'n_updates': 53},
  {'average_entropy': 71313.29,
   'average_policy_loss': 0.00016046616851613012,
   'average_value': -1.1078151,
   'average_value_loss': 1.3338033885315612,
   'eval_score': 0.051060883339943464,
   'explained_variance': -155.0251960693383,
   'n_updates': 54},
  {'average_entropy': 71313.305,
   'average_policy_loss': 0.00023882706174679861,
   'average_value': -1.0872896,
   'average_value_loss': 1.3130737553943288,
   'eval_score': 0.051060883339943464,
   'explained_variance': -133.39454800252926,
   'n_updates': 55},
  {'average_entropy': 71313.31,
   'average_policy_loss': 0.00023882706174679861,
   'average_value': -1.0839288,
   'average_value_loss': 1.3130737553943288,
   'eval_score': 0.051060883339943464,
   'explained_variance': -133.39454800252926,
   'n_updates': 55},
  {'average_entropy': 71313.336,
   'average_policy_loss': 0.0002585244264342431,
   'average_value': -1.0684801,
   'average_value_loss': 1.2955336006624358,
   'eval_score': 0.051060883339943464,
   'explained_variance': -19.625075793552828,
   'n_updates': 56},
  {'average_entropy': 71313.35,
   'average_policy_loss': 0.00020445701330461082,
   'average_value': -1.050933,
   'average_value_loss': 1.2769644127080315,
   'eval_score': 0.051060883339943464,
   'explained_variance': -11.145038702421743,
   'n_updates': 57},
  {'average_entropy': 71313.37,
   'average_policy_loss': 0.00021025619966074651,
   'average_value': -1.033803,
   'average_value_loss': 1.2460979980937505,
   'eval_score': 0.051060883339943464,
   'explained_variance': -65.13956917530275,
   'n_updates': 59},
  {'average_entropy': 71313.39,
   'average_policy_loss': 0.00013127381516824243,
   'average_value': -1.0146691,
   'average_value_loss': 1.233799797296524,
   'eval_score': 0.051060883339943464,
   'explained_variance': -5.447836198833656,
   'n_updates': 60},
  {'average_entropy': 71313.414,
   'average_policy_loss': 0.0001340141230893558,
   'average_value': -0.99085534,
   'average_value_loss': 1.2154734047954199,
   'eval_score': 0.051060883339943464,
   'explained_variance': -10.193662811242667,
   'n_updates': 61},
  {'average_entropy': 71313.42,
   'average_policy_loss': -3.1615591362952405e-06,
   'average_value': -0.97872937,
   'average_value_loss': 1.2004664507844756,
   'eval_score': 0.051060883339943464,
   'explained_variance': -33.175360647756264,
   'n_updates': 62},
  {'average_entropy': 71313.44,
   'average_policy_loss': -0.00011990143655797482,
   'average_value': -0.95829123,
   'average_value_loss': 1.1893363548886209,
   'eval_score': 0.051060883339943464,
   'explained_variance': -1.5340383131866893,
   'n_updates': 63},
  {'average_entropy': 71313.5,
   'average_policy_loss': -0.0001777062998906545,
   'average_value': -0.85051185,
   'average_value_loss': 1.242111551520579,
   'eval_score': 0.051060883339943464,
   'explained_variance': -888.0687453298063,
   'n_updates': 68},
  {'average_entropy': 71313.51,
   'average_policy_loss': -0.0003291908744385169,
   'average_value': -0.8399318,
   'average_value_loss': 1.235224168477715,
   'eval_score': 0.051060883339943464,
   'explained_variance': -0.3201928154205531,
   'n_updates': 69},
  {'average_entropy': 71313.52,
   'average_policy_loss': -0.0002839365799445659,
   'average_value': -0.8181706,
   'average_value_loss': 1.2843311149094785,
   'eval_score': 0.051060883339943464,
   'explained_variance': -0.2696518433590207,
   'n_updates': 70},
  {'average_entropy': 71313.516,
   'average_policy_loss': -0.00030307473145871306,
   'average_value': -0.8137186,
   'average_value_loss': 1.2754165699154558,
   'eval_score': 0.051060883339943464,
   'explained_variance': -6376.306062245339,
   'n_updates': 71},
  {'average_entropy': 71313.54,
   'average_policy_loss': -0.0004507025736824584,
   'average_value': -0.77308524,
   'average_value_loss': 1.2388115018807553,
   'eval_score': 0.051060883339943464,
   'explained_variance': -51.03049916475539,
   'n_updates': 74},
  {'average_entropy': 71313.54,
   'average_policy_loss': -0.000460378126406699,
   'average_value': -0.7523759,
   'average_value_loss': 1.2134499484182972,
   'eval_score': 0.051060883339943464,
   'explained_variance': -86.34497778856294,
   'n_updates': 76},
  {'average_entropy': 71313.54,
   'average_policy_loss': -0.00037824241489825116,
   'average_value': -0.74073523,
   'average_value_loss': 1.1993875744474398,
   'eval_score': 0.051060883339943464,
   'explained_variance': -22.64250657127632,
   'n_updates': 77},
  {'average_entropy': 71313.53,
   'average_policy_loss': -0.0007426668620610144,
   'average_value': -0.7161389,
   'average_value_loss': 1.1581365015823395,
   'eval_score': 0.051060883339943464,
   'explained_variance': -628.3890231942744,
   'n_updates': 80}])

Load the best checkpoint and predict.

# Restore the best-scoring agent saved during training, then sample again.
agent.load("./elon_musk_dogecoin/best")
actor.predict('i think dogecoin is')
' a great idea, but I think it is a little overused.'