|
@@ -184,22 +184,20 @@
|
|
|
{
|
|
|
"cell_type": "code",
|
|
|
"source": [
|
|
|
+ "def clean_text(text):\n",
|
|
|
+ " text = re.sub(r\"http\\S+\", \"\", text)\n",
|
|
|
+ " text = re.sub(r\"@[^\\s]+\", \"\", text)\n",
|
|
|
+ " text = re.sub(r\"\\s+\", \" \", text)\n",
|
|
|
+ " return re.sub(r\"\\^[^ ]+\", \"\", text)\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
"def create_conversation_text(data_point):\n",
|
|
|
" text = \"\"\n",
|
|
|
" for item in data_point[\"log\"]:\n",
|
|
|
- " user = item[\"user utterance\"]\n",
|
|
|
- "\n",
|
|
|
- " user = re.sub(r\"http\\S+\", \"\", user)\n",
|
|
|
- " user = re.sub(r\"@[^\\s]+\", \"\", user)\n",
|
|
|
- " user = re.sub(r\"\\s+\", \" \", user)\n",
|
|
|
- " user = re.sub(r\"\\^[^ ]+\", \"\", user)\n",
|
|
|
+ " user = clean_text(item[\"user utterance\"])\n",
|
|
|
" text += f\"user: {user.strip()}\\n\"\n",
|
|
|
"\n",
|
|
|
- " agent = item[\"system response\"]\n",
|
|
|
- " agent = re.sub(r\"http\\S+\", \"\", agent)\n",
|
|
|
- " agent = re.sub(r\"@[^\\s]+\", \"\", agent)\n",
|
|
|
- " agent = re.sub(r\"\\s+\", \" \", agent)\n",
|
|
|
- " agent = re.sub(r\"\\^[^ ]+\", \"\", agent)\n",
|
|
|
+ " agent = clean_text(item[\"system response\"])\n",
|
|
|
" text += f\"agent: {agent.strip()}\\n\"\n",
|
|
|
"\n",
|
|
|
" return text"
|