!pip install pfrl@git+https://github.com/voidful/pfrl.git
!pip install textrl==0.1.6
from textrl import TextRLEnv, TextRLActor
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import logging
import sys
import pfrl
import torch
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

Using a pre-trained model, we can generate tweets in Elon Musk's style.

tokenizer = AutoTokenizer.from_pretrained("huggingtweets/elonmusk")  
model = AutoModelForCausalLM.from_pretrained("huggingtweets/elonmusk")
model.eval()
model.cuda()
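
For example, you can sample from the pre-trained model directly to see its style before any RL fine-tuning. The snippet below is only an illustrative sketch; the prompt and generation settings are arbitrary, not taken from the original run.

prompt = "i think dogecoin is"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
with torch.no_grad():
    sample = model.generate(**inputs, max_new_tokens=20, do_sample=True, top_p=0.9)
print(tokenizer.decode(sample[0], skip_special_tokens=True))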

Load a sentiment classifier to use as the RL reward.

sentiment = pipeline(
    'sentiment-analysis',
    model="cardiffnlp/twitter-roberta-base-sentiment",
    tokenizer="cardiffnlp/twitter-roberta-base-sentiment",
    device=0,
    return_all_scores=True,  # scores for all labels, in label order: LABEL_0=negative, LABEL_1=neutral, LABEL_2=positive
)
transformers_logger = logging.getLogger('transformers')
transformers_logger.setLevel(logging.CRITICAL)
sentiment("dogecoin is bad")
[[{'label': 'LABEL_0', 'score': 0.9338533878326416},
  {'label': 'LABEL_1', 'score': 0.06011885032057762},
  {'label': 'LABEL_2', 'score': 0.0060277231968939304}]]
sentiment("dogecoin is bad")[0][0]['score']
0.9338533878326416
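
Because return_all_scores=True keeps the labels in index order and LABEL_0 is the negative class for this model, [0][0]['score'] is the probability of negative sentiment. A tiny helper (illustrative, not part of the original notebook) makes that intent explicit:

def negative_score(text):
    # LABEL_0 = negative for cardiffnlp/twitter-roberta-base-sentiment
    return sentiment(text)[0][0]['score']

negative_score("dogecoin is bad")  # same value as the indexing above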

Set our text-generation reward: inverse perplexity plus a sentiment classifier.

  • Inverse perplexity keeps the probability of the generated sentence high (the get_reward below only scores sentiment; a sketch adding this term follows the class).
  • The sentiment classifier pushes the generation toward more negative text: LABEL_0 is the negative class, so [0][0]['score'] is the negative-sentiment probability.

class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):
        # predicted_list is the list of tokens generated so far
        reward = 0
        if finish or len(predicted_list) >= self.env_max_length:
            if 1 < len(predicted_list):
                predicted_text = tokenizer.convert_tokens_to_string(predicted_list)
                # sentiment classifier: probability of the negative class (LABEL_0)
                # on the prompt plus the generated text
                reward += sentiment(input_item[0] + predicted_text)[0][0]['score']
        return reward
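
Below is a sketch of how the inverse-perplexity term mentioned in the bullets could be folded into the same reward; the equal weighting of the two terms is an assumption, not something taken from the original run.

class MyRLEnvWithPerplexity(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):
        reward = 0
        if finish or len(predicted_list) >= self.env_max_length:
            if 1 < len(predicted_list):
                predicted_text = tokenizer.convert_tokens_to_string(predicted_list)
                full_text = input_item[0] + predicted_text
                # sentiment term: negative-class probability, as above
                reward += sentiment(full_text)[0][0]['score']
                # inverse-perplexity term: exp(-mean token loss), close to 1 when the
                # language model finds the text likely, close to 0 otherwise
                ids = tokenizer(full_text, return_tensors="pt").input_ids.cuda()
                with torch.no_grad():
                    loss = model(ids, labels=ids).loss
                reward += torch.exp(-loss).item()
        return reward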

Fit on a single example.

observation_list = [['i think dogecoin is']]
env = MyRLEnv(model, tokenizer, observation_input=observation_list)
actor = TextRLActor(env, model, tokenizer)
agent = actor.agent_ppo(update_interval=10, minibatch_size=10, epochs=10)
actor.predict(observation_list[0])
' a good idea'
pfrl.experiments.train_agent_with_evaluation(
    agent,
    env,
    steps=100,
    eval_n_steps=None,
    eval_n_episodes=1,       
    train_max_episode_len=100,  
    eval_interval=10,
    outdir='elon_musk_dogecoin', 
)
(<textrl.actor.TextPPO at 0x7f902414b6a0>,
 [{'average_value': 2.0006444,
   'average_entropy': 0.17440723,
   'average_value_loss': 1.3542563378810883,
   'average_policy_loss': -0.04205846681725234,
   'n_updates': 10,
   'explained_variance': -2.7000492592458043,
   'eval_score': 0.001555838156491518},
  {'average_value': 1.2077988,
   'average_entropy': 0.10219431,
   'average_value_loss': 0.8049726754426956,
   'average_policy_loss': -0.043247044883901256,
   'n_updates': 20,
   'explained_variance': nan,
   'eval_score': 0.008166163228452206},
  {'average_value': 0.68971634,
   'average_entropy': 0.08226016,
   'average_value_loss': 0.583228854338328,
   'average_policy_loss': -0.03243164799874639,
   'n_updates': 30,
   'explained_variance': nan,
   'eval_score': 0.009085850790143013},
  {'average_value': 0.49251693,
   'average_entropy': 0.07101057,
   'average_value_loss': 0.44462784337811173,
   'average_policy_loss': -0.03321644520212423,
   'n_updates': 40,
   'explained_variance': -112.08653704970607,
   'eval_score': 0.9138848781585693},
  {'average_value': 0.44328108,
   'average_entropy': 0.058479678,
   'average_value_loss': 0.40179401244968177,
   'average_policy_loss': -0.030547107956139286,
   'n_updates': 50,
   'explained_variance': -0.042075806250177594,
   'eval_score': 0.9138848781585693},
  {'average_value': 0.4795684,
   'average_entropy': 0.04970775,
   'average_value_loss': 0.3383651294396259,
   'average_policy_loss': -0.025455901640428043,
   'n_updates': 60,
   'explained_variance': nan,
   'eval_score': 0.9138848781585693},
  {'average_value': 0.5787231,
   'average_entropy': 0.04142314,
   'average_value_loss': 0.29252415439113977,
   'average_policy_loss': -0.02181934324143011,
   'n_updates': 70,
   'explained_variance': -0.2262049999078275,
   'eval_score': 0.9138848781585693},
  {'average_value': 0.6124888,
   'average_entropy': 0.036820583,
   'average_value_loss': 0.2565400848223362,
   'average_policy_loss': -0.01909192969484108,
   'n_updates': 80,
   'explained_variance': -4.575841818340245,
   'eval_score': 0.9138848781585693},
  {'average_value': 0.63870776,
   'average_entropy': 0.033138536,
   'average_value_loss': 0.22813357474113016,
   'average_policy_loss': -0.016970614835801,
   'n_updates': 90,
   'explained_variance': nan,
   'eval_score': 0.9138848781585693},
  {'average_value': 0.6700342,
   'average_entropy': 0.029824698,
   'average_value_loss': 0.20538896852887775,
   'average_policy_loss': -0.015273552626720602,
   'n_updates': 100,
   'explained_variance': -0.4705488598887635,
   'eval_score': 0.9138848781585693}])

Load the best checkpoint and predict.

agent.load("./elon_musk_dogecoin/best")
actor.predict(observation_list[0])
' a hoax'