1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
| # Copyright (c) OpenMMLab. All rights reserved. import torch from datasets import load_dataset from mmengine.dataset import DefaultSampler from mmengine.hooks import ( CheckpointHook, DistSamplerSeedHook, IterTimerHook, LoggerHook, ParamSchedulerHook, ) from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR from peft import LoraConfig from torch.optim import AdamW from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from xtuner.dataset import process_hf_dataset from xtuner.dataset.collate_fns import default_collate_fn from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory from xtuner.engine.hooks import ( DatasetInfoHook, EvaluateChatHook, VarlenAttnArgsToMessageHubHook, ) from xtuner.engine.runner import TrainLoop from xtuner.model import SupervisedFinetune from xtuner.parallel.sequence import SequenceParallelSampler from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
####################################################################### # PART 1 Settings # ####################################################################### **# Model 修改模型路径** **pretrained_model_name_or_path = "/root/autodl-tmp/models/Qwen/Qwen1___5-1___8B-Chat"** use_varlen_attn = False
# Data **alpaca_en_path = "/root/autodl-tmp/code/data/train_data.json" # 自定的json** prompt_template = PROMPT_TEMPLATE.qwen_chat **max_length = 512** pack_to_max_length = True
# parallel sequence_parallel_size = 1
# Scheduler & Optimizer **batch_size = 6** # per_device accumulative_counts = 16 accumulative_counts *= sequence_parallel_size dataloader_num_workers = 0 **max_epochs = 10** optim_type = AdamW lr = 2e-4 betas = (0.9, 0.999) weight_decay = 0 max_norm = 1 # grad clip warmup_ratio = 0.03
# Save save_steps = 500 save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
# Evaluate the generation performance during the training evaluation_freq = 500 evaluation_freq = 500 **SYSTEM = '你由Veeblue打造的中文领域心理健康助手, 是一个研究过无数具有心理健康问题的病人与心理健康医生对话的心理专家, 在心理方面拥有广博的知识储备和丰富的研究咨询经验,你旨在通过专业心理咨询, 协助来访者完成心理诊断。请充分利用专业心理学知识与咨询技术, 一步步帮助来访者解决心理问题, 接下来你将只使用中文来回答和咨询问题。' evaluation_inputs = [ '请介绍你自己', # self cognition '你好', '我今天心情不好,感觉不开心,很烦。', '我最近总是感到很焦虑,尤其是在学业上。我有个特别崇拜的同学,他好像在各方面都比我优秀,我总觉得自己怎么努力也追不上他,这让我压力特别大。', ]**
####################################################################### # PART 2 Model & Tokenizer # ####################################################################### tokenizer = dict( type=AutoTokenizer.from_pretrained, pretrained_model_name_or_path=pretrained_model_name_or_path, trust_remote_code=True, padding_side="right", )
model = dict( type=SupervisedFinetune, use_varlen_attn=use_varlen_attn, llm=dict( type=AutoModelForCausalLM.from_pretrained, pretrained_model_name_or_path=pretrained_model_name_or_path, trust_remote_code=True, torch_dtype=torch.float16, **quantization_config=None, # 设置None不启用QLora微调 # 若启用QLora: # dict( # type=BitsAndBytesConfig, # load_in_4bit=True, # load_in_8bit=False, # llm_int8_threshold=6.0, # llm_int8_has_fp16_weight=False, # bnb_4bit_compute_dtype=torch.float16, # bnb_4bit_use_double_quant=True, # bnb_4bit_quant_type="nf4", # ),** ), lora=dict( type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM", ), )
####################################################################### # PART 3 Dataset & Dataloader # ####################################################################### alpaca_en = dict( type=process_hf_dataset, **dataset=dict(type=load_dataset, path='json', data_files=alpaca_en_path), #使用自定义的json** tokenizer=tokenizer, max_length=max_length, **dataset_map_fn=None,** template_map_fn=dict(type=template_map_fn_factory, template=prompt_template), remove_unused_columns=True, shuffle_before_pack=True, pack_to_max_length=pack_to_max_length, use_varlen_attn=use_varlen_attn, )
sampler = SequenceParallelSampler if sequence_parallel_size > 1 else DefaultSampler
train_dataloader = dict( batch_size=batch_size, num_workers=dataloader_num_workers, dataset=alpaca_en, sampler=dict(type=sampler, shuffle=True), collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn), )
####################################################################### # PART 4 Scheduler & Optimizer # ####################################################################### # optimizer optim_wrapper = dict( type=AmpOptimWrapper, optimizer=dict(type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), accumulative_counts=accumulative_counts, loss_scale="dynamic", dtype="float16", )
# learning policy # More information: https: param_scheduler = [ dict( type=LinearLR, start_factor=1e-5, by_epoch=True, begin=0, end=warmup_ratio * max_epochs, convert_to_iter_based=True, ), dict( type=CosineAnnealingLR, eta_min=0.0, by_epoch=True, begin=warmup_ratio * max_epochs, end=max_epochs, convert_to_iter_based=True, ), ]
# train, val, test setting train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
####################################################################### # PART 5 Runtime # ####################################################################### # Log the dialogue periodically during the training process, optional custom_hooks = [ dict(type=DatasetInfoHook, tokenizer=tokenizer), dict( type=EvaluateChatHook, tokenizer=tokenizer, every_n_iters=evaluation_freq, evaluation_inputs=evaluation_inputs, system=SYSTEM, prompt_template=prompt_template, ), ]
if use_varlen_attn: custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
# configure default hooks default_hooks = dict( # record the time of every iteration. timer=dict(type=IterTimerHook), # print log every 10 iterations. logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), # enable the parameter scheduler. param_scheduler=dict(type=ParamSchedulerHook), # save checkpoint per `save_steps`. checkpoint=dict( type=CheckpointHook, by_epoch=False, interval=save_steps, max_keep_ckpts=save_total_limit, ), # set sampler seed in distributed evrionment. sampler_seed=dict(type=DistSamplerSeedHook), )
# configure environment env_cfg = dict( # whether to enable cudnn benchmark cudnn_benchmark=False, # set multi process parameters mp_cfg=dict(mp_start_method="fork", opencv_num_threads=0), # set distributed parameters dist_cfg=dict(backend="nccl"), )
# set visualizer visualizer = None
# set log level log_level = "INFO"
# load from which checkpoint load_from = None
# whether to resume training from the loaded checkpoint resume = False
# Defaults to use random seed and disable `deterministic` randomness = dict(seed=None, deterministic=False)
# set log processor log_processor = dict(by_epoch=False)
|