NLP & Text Analysis with Vertex AI
Building natural language processing models for sentiment analysis, entity extraction, text classification, and custom NLP tasks.
Overview
Vertex AI provides pre-built NLP APIs for common tasks and AutoML for training custom models without writing model code. You can also fine-tune Google's foundation models or train entirely custom NLP models with your own training scripts.
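For tasks the pre-built APIs already cover, no training is needed at all. As a minimal sketch (assuming the google-cloud-language client library is installed), sentiment for a single string can be scored with the Cloud Natural Language API:

from google.cloud import language_v1

client = language_v1.LanguageServiceClient()
document = language_v1.Document(
    content="Great product, fast shipping!",
    type_=language_v1.Document.Type.PLAIN_TEXT,
)

# document_sentiment.score ranges from -1.0 (negative) to 1.0 (positive)
sentiment = client.analyze_sentiment(request={"document": document}).document_sentiment
print(f"score={sentiment.score:.2f}, magnitude={sentiment.magnitude:.2f}")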
Text Classification with AutoML
Python - AutoML Text Classification
from google.cloud import aiplatform
# Initialize
aiplatform.init(project="your-project-id", location="us-central1")
# Create text classification dataset
dataset = aiplatform.TextDataset.create(
    display_name="product-reviews-classification",
    gcs_source="gs://your-bucket/reviews_data.csv",  # one "text,label" row per example
    import_schema_uri=aiplatform.schema.dataset.ioformat.text.single_label_classification,
)

# Train text classification model
job = aiplatform.AutoMLTextTrainingJob(
    display_name="review-classifier",
    prediction_type="classification",  # "classification", "sentiment", or "extraction"
    multi_label=False,  # set True to allow multiple labels per document
)

model = job.run(
    dataset=dataset,
    training_fraction_split=0.8,
    validation_fraction_split=0.1,
    test_fraction_split=0.1,
    model_display_name="review-classifier",
)
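The CSV above is expected to hold one example per row, inline text followed by its label; a hypothetical reviews_data.csv might look like this:

"The battery died after two days",negative
"Exactly what I needed, works perfectly",positive
"Shipping was slow but the product itself is fine",neutral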
Sentiment Analysis Pipeline
Python - Custom Sentiment Model
from google.cloud import aiplatform
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Contents of train_sentiment.py, the script submitted below
def train_sentiment_model():
    import pandas as pd
    from google.cloud import bigquery

    # Load labeled reviews from BigQuery
    client = bigquery.Client()
    df = client.query("""
        SELECT text, sentiment FROM `your-project.reviews.customer_reviews`
    """).to_dataframe()

    # Prepare text data
    texts = df['text'].values
    labels = (df['sentiment'] == 'positive').astype(int).values

    # Tokenize and pad to a fixed length
    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=100)

    # Build a bidirectional LSTM classifier
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(10000, 128),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    # Train
    model.fit(padded, labels, epochs=10, validation_split=0.2)

    # Save the SavedModel directly to Cloud Storage
    model.save('gs://your-bucket/sentiment-model')

if __name__ == "__main__":
    train_sentiment_model()

# Submit the script as a Vertex AI custom training job
aiplatform.init(project="your-project-id")

job = aiplatform.CustomTrainingJob(
    display_name="sentiment-analyzer",
    script_path="train_sentiment.py",
    container_uri="gcr.io/cloud-aiplatform/training/tf-cpu.2-12",
)

# No serving container is configured, so run() does not return a Model;
# the trained artifacts are written to the GCS path in the script
job.run(
    replica_count=1,
    machine_type="n1-standard-4",
)
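Once training finishes, the SavedModel can be registered and served for online prediction. A sketch, assuming the prebuilt TF 2.12 serving container and the bucket path used above; note the endpoint will expect tokenized, padded integer sequences, so the same Tokenizer must be applied client-side:

# Register the SavedModel written by the training script
model = aiplatform.Model.upload(
    display_name="sentiment-analyzer",
    artifact_uri="gs://your-bucket/sentiment-model",
    serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-12:latest",
)

# Deploy to an endpoint for online prediction
endpoint = model.deploy(machine_type="n1-standard-4")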
Named Entity Recognition (NER)
Python - Custom NER Model
from google.cloud import aiplatform
import tensorflow as tf
from tensorflow.keras.layers import Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential
# NER dataset format: One token per line, with BIO tags
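# For example, training_data.txt might contain (hypothetical data):
#
#   John    B-PER
#   Smith   I-PER
#   joined  O
#   Google  B-ORG
#
# with a blank line separating sentences.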
def train_ner_model():
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    # Load training data: one "token tag" pair per line, blank line between sentences
    sentences = []
    tags = []
    with open('training_data.txt', 'r') as f:
        sentence = []
        sentence_tags = []
        for line in f:
            line = line.strip()
            if line:
                word, tag = line.split()[:2]
                sentence.append(word)
                sentence_tags.append(tag)
            else:
                if sentence:
                    sentences.append(sentence)
                    tags.append(sentence_tags)
                sentence = []
                sentence_tags = []
        if sentence:  # keep a final sentence that lacks a trailing blank line
            sentences.append(sentence)
            tags.append(sentence_tags)

    # Build vocabularies; index 0 is reserved for padding in both maps
    word_set = set()
    tag_set = set()
    for sentence in sentences:
        word_set.update(sentence)
    for tag_list in tags:
        tag_set.update(tag_list)
    word_to_idx = {w: i + 1 for i, w in enumerate(sorted(word_set))}
    tag_to_idx = {t: i + 1 for i, t in enumerate(sorted(tag_set))}

    # Convert tokens and tags to integer sequences
    X = []
    Y = []
    for sentence, tag_list in zip(sentences, tags):
        X.append([word_to_idx[w] for w in sentence])
        Y.append([tag_to_idx[t] for t in tag_list])

    # Pad to a fixed length; pad value 0 matches the reserved index
    X = pad_sequences(X, maxlen=50)
    Y = pad_sequences(Y, maxlen=50)

    # Token-level tagger; mask_zero makes Keras skip padded steps in the loss
    model = Sequential([
        tf.keras.layers.Embedding(len(word_to_idx) + 1, 64, mask_zero=True),
        Bidirectional(LSTM(64, return_sequences=True)),
        Dense(len(tag_to_idx) + 1, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    model.fit(X, Y, epochs=10)
    model.save('gs://your-bucket/ner-model')

if __name__ == "__main__":
    train_ner_model()
# Submit the NER training script as a custom job
aiplatform.init(project="your-project-id")

job = aiplatform.CustomTrainingJob(
    display_name="ner-model",
    script_path="train_ner.py",
    container_uri="gcr.io/cloud-aiplatform/training/tf-cpu.2-12",
)

job.run(replica_count=1, machine_type="n1-standard-4")
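At inference time the same vocabularies must be reapplied and the argmax indices mapped back to tag strings. A minimal sketch, assuming word_to_idx and an inverse idx_to_tag map were persisted alongside the model (both names are hypothetical here):

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def predict_tags(model, tokens, word_to_idx, idx_to_tag, maxlen=50):
    ids = [[word_to_idx.get(w, 0) for w in tokens]]      # 0 for unknown words
    padded = pad_sequences(ids, maxlen=maxlen)
    probs = model.predict(padded)[0]                     # shape: (maxlen, n_tags + 1)
    tag_ids = np.argmax(probs, axis=-1)[-len(tokens):]   # pre-padding puts real tokens last
    return [idx_to_tag.get(i, "O") for i in tag_ids]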
Fine-Tuning Foundation Models
Python - Tuned Model API
import vertexai
from vertexai.language_models import TextGenerationModel

# Initialize
vertexai.init(project="your-project-id", location="us-central1")

# Load the base foundation model
model = TextGenerationModel.from_pretrained("text-bison@002")

# Training data is JSONL: one {"input_text": ..., "output_text": ...} record per line
training_data = "gs://your-bucket/training_data.jsonl"

# Launch the supervised tuning job
tuning_job = model.tune_model(
    training_data=training_data,
    train_steps=300,
    learning_rate_multiplier=1.0,
    tuning_job_location="europe-west4",
    tuned_model_location="us-central1",
    model_display_name="my-tuned-model",
)

# The tuned model is deployed automatically when tuning completes
tuned_model = tuning_job.get_tuned_model()

# Make predictions with the tuned model
response = tuned_model.predict(
    "Classify this customer review as positive or negative: Great product!"
)
print(response.text)
Semantic Search & Embeddings
Python - Text Embeddings
from google.cloud import aiplatform
import numpy as np
# Initialize
aiplatform.init(project="your-project-id")
# Use Vertex Text Embeddings API
from vertexai.language_models import TextEmbeddingModel
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")
# Get embeddings for documents
documents = [
"Python is a programming language",
"Java is used for enterprise applications",
"JavaScript runs in web browsers",
]
embeddings = model.get_embeddings(documents)
# Store embeddings in Vertex AI Vector Search (formerly Matching Engine).
# Vectors must first be written to GCS as JSON, one
# {"id": "doc-0", "embedding": [...]} record per line.

# Create an approximate nearest neighbor (Tree-AH) index
index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name="document-index",
    contents_delta_uri="gs://your-bucket/embeddings/",
    dimensions=768,  # textembedding-gecko returns 768-dimensional vectors
    approximate_neighbors_count=150,
)

# Deploy the index to an endpoint before querying
index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name="document-index-endpoint",
    public_endpoint_enabled=True,
)
index_endpoint.deploy_index(index=index, deployed_index_id="document_index_deployed")

# For semantic search
query = "What programming languages exist?"
query_embedding = model.get_embeddings([query])[0]

# Find similar documents
neighbors = index_endpoint.find_neighbors(
    deployed_index_id="document_index_deployed",
    queries=[query_embedding.values],
    num_neighbors=5,
)
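For a handful of documents, a managed index is overkill; the numpy import above is enough to rank the corpus by cosine similarity in memory:

# Rank documents by cosine similarity to the query, entirely locally
doc_matrix = np.array([e.values for e in embeddings])
q = np.array(query_embedding.values)

scores = doc_matrix @ q / (np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(q))
for idx in np.argsort(scores)[::-1]:
    print(f"{scores[idx]:.3f}  {documents[idx]}")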
Question Answering System
Python - QA Model
from google.cloud import aiplatform
from vertexai.language_models import TextGenerationModel
# Initialize
aiplatform.init(project="your-project-id")
# Use Text Generation model for QA
model = TextGenerationModel.from_pretrained("text-bison@002")
# Create prompt for question answering
def answer_question(question, context):
    prompt = f"""Answer the question based on the context provided.

Context:
{context}

Question: {question}

Answer:"""
    response = model.predict(
        prompt=prompt,
        temperature=0.2,
        max_output_tokens=256,
        top_p=0.95,
        top_k=40,
    )
    return response.text
# Example usage
context = """
Python is a high-level programming language. It was created by Guido van Rossum.
Python emphasizes code readability and simplicity.
"""
question = "Who created Python?"
answer = answer_question(question, context)
print(f"Q: {question}")
print(f"A: {answer}")
Batch Text Processing
Python - Large-Scale Processing
from google.cloud import aiplatform
import json
# Batch prediction for NLP
model = aiplatform.Model("your-model-resource-id")

# Prepare batch input: JSONL, one instance per line, from a file of one text per line
batch_data = []
with open("texts.txt") as f:
    for line in f:
        line = line.strip()
        if line:
            batch_data.append({"text": line})

# Write to GCS
from google.cloud import storage
bucket = storage.Client().bucket("your-bucket")
blob = bucket.blob("batch_input/texts.jsonl")
blob.upload_from_string(
    "\n".join(json.dumps(item) for item in batch_data)
)

# Create batch prediction job
batch_job = model.batch_predict(
    job_display_name="text-classification-batch",
    gcs_source="gs://your-bucket/batch_input/texts.jsonl",
    gcs_destination_prefix="gs://your-bucket/batch_output/",
    sync=False,
)

# Wait for completion
batch_job.wait()
print(f"Results in: {batch_job.output_info.gcs_output_directory}")
Best Practices
- Start with AutoML for quick prototyping
- Use fine-tuning for domain-specific tasks
- Properly annotate training data with clear labels
- Handle class imbalance in multi-class problems
- Monitor model performance on out-of-domain text
- Use embeddings for semantic similarity tasks
- Implement proper text preprocessing and normalization (see the sketch below)
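As a starting point for that last item, a minimal normalization pass (the steps are assumptions; adapt them to your domain and models):

import re

def normalize_text(text: str) -> str:
    text = text.lower().strip()
    text = re.sub(r"https?://\S+", " ", text)   # drop URLs
    text = re.sub(r"[^\w\s']", " ", text)       # strip punctuation
    text = re.sub(r"\s+", " ", text).strip()    # collapse whitespace
    return text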