After I managed to wrangle the review text from the HTML, it still needed this sort of cleanup:
def clean_text(text):
    """Normalize whitespace in extracted text and re-flow hard-wrapped lines.

    Steps, in order:

    1. Delete DEL and the C1 control characters (U+007F-U+009F).
    2. Collapse every run of non-newline whitespace (spaces, tabs,
       carriage returns, non-breaking spaces, ...) into a single space.
    3. Collapse every run of line breaks, together with the spaces around
       them, into a single newline, then strip the ends of the text.
    4. Replace a newline with a space unless the character before it looks
       like the end of a sentence (one of ``. ? ! " )``).
    5. Merge any line of at most 80 characters that sits between two
       surviving newlines into its neighbours, separated by spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    # Remove DEL and C1 control characters.
    text = re.sub(r"[\x7f-\x9f]", "", text)

    # Squash horizontal whitespace only. Plain ``\s+`` would also swallow
    # newlines, which would leave the line-break handling below with
    # nothing to work on.
    text = re.sub(r"[^\S\n]+", " ", text)

    # After the step above the only whitespace left is single spaces and
    # newlines, so this folds blank lines, "\r\n" remnants and spaces
    # hugging a line break into exactly one newline.
    text = re.sub(r" ?\n[ \n]*", "\n", text)

    # Strip before unwrapping so the result does not depend on whether the
    # input happened to start or end with a line break.
    text = text.strip()

    # Unwrap: a newline that does not follow sentence-ending punctuation
    # is treated as a hard wrap inside a sentence.
    text = re.sub(r"([^.?!\"\)])\n", r"\1 ", text)

    # Fold short lines (<= 80 characters) into the surrounding text. Both
    # newlines become spaces so the short line is never glued onto the end
    # of the previous sentence.
    text = re.sub(r"\n([^\n]{0,80})\n", r" \1 ", text)

    return text