Here we look at detecting hand gestures: creating our starting point and using it to detect four categories (None, Rock, Paper, Scissors), then adding some American Sign Language (ASL) categories to explore how much harder it is for the AI to detect additional gestures.
TensorFlow + JavaScript. The most popular, cutting-edge AI framework now supports the most widely used programming language on the planet, so let's make magic happen through deep learning right in our web browser, GPU-accelerated via WebGL, using TensorFlow.js!
In this article, we will take photos of different hand gestures via webcam and use transfer learning on a pre-trained MobileNet model to build a computer vision AI that can recognize the various gestures in real time.
Starting Point
To recognize multiple hand gestures, we are going to use almost-ready starter code and expand it to detect more categories of objects. Here is what the code will do:
- Import TensorFlow.js and TensorFlow's tf-data.js
- Define the gesture category labels (None, Rock, Paper, Scissors)
- Add a video element for the webcam
- Run the model prediction every 200 ms after it's been trained for the first time
- Show the prediction result
- Load a pre-trained MobileNet model and prepare it for transfer learning with as many output categories as there are labels
- Train and classify a variety of custom objects in images
- Skip disposing image and target samples in the training process to keep them for multiple training runs
Here is our starting point for this project:
<html>
<head>
<meta charset="UTF-8">
<title>Interpreting Hand Gestures and Sign Language in the Webcam with AI using TensorFlow.js</title>
<script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs@2.0.0/dist/tf.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs-data@2.0.0/dist/tf-data.min.js"></script>
<style>
img, video {
object-fit: cover;
}
</style>
</head>
<body>
<video autoplay playsinline muted id="webcam" width="224" height="224"></video>
<div id="buttons">
<button onclick="captureSample(0)">None</button>
<button onclick="captureSample(1)">β (Rock)</button>
<button onclick="captureSample(2)">π (Paper)</button>
<button onclick="captureSample(3)">βοΈ (Scissors)</button>
<button onclick="trainModel()">Train</button>
</div>
<h1 id="status">Loading...</h1>
<script>
let trainingData = [];
const labels = [
"None",
"β (Rock)",
"π (Paper)",
"βοΈ (Scissors)",
];
function setText( text ) {
document.getElementById( "status" ).innerText = text;
}
// Classify the current webcam frame and display the most likely label
async function predictImage() {
if( !hasTrained ) { return; }
const img = await getWebcamImage();
let result = tf.tidy( () => {
const input = img.reshape( [ 1, 224, 224, 3 ] );
return model.predict( input );
});
img.dispose();
let prediction = await result.data();
result.dispose();
let id = prediction.indexOf( Math.max( ...prediction ) );
setText( labels[ id ] );
}
// Cut MobileNet off at its bottleneck layer, freeze the base, and attach a new trainable classification head
function createTransferModel( model ) {
const bottleneck = model.getLayer( "dropout" );
const baseModel = tf.model({
inputs: model.inputs,
outputs: bottleneck.output
});
for( const layer of baseModel.layers ) {
layer.trainable = false;
}
const newHead = tf.sequential();
newHead.add( tf.layers.flatten( {
inputShape: baseModel.outputs[ 0 ].shape.slice( 1 )
} ) );
newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
newHead.add( tf.layers.dense( { units: 10, activation: 'relu' } ) );
newHead.add( tf.layers.dense( {
units: labels.length,
kernelInitializer: 'varianceScaling',
useBias: false,
activation: 'softmax'
} ) );
const newOutput = newHead.apply( baseModel.outputs[ 0 ] );
const newModel = tf.model( { inputs: baseModel.inputs, outputs: newOutput } );
return newModel;
}
// One-hot encode the captured samples and fit the transfer model on them
async function trainModel() {
hasTrained = false;
setText( "Training..." );
const imageSamples = [];
const targetSamples = [];
trainingData.forEach( sample => {
imageSamples.push( sample.image );
let cat = [];
for( let c = 0; c < labels.length; c++ ) {
cat.push( c === sample.category ? 1 : 0 );
}
targetSamples.push( tf.tensor1d( cat ) );
});
const xs = tf.stack( imageSamples );
const ys = tf.stack( targetSamples );
model.compile( { loss: "meanSquaredError", optimizer: "adam", metrics: [ "acc" ] } );
await model.fit( xs, ys, {
epochs: 30,
shuffle: true,
callbacks: {
onEpochEnd: ( epoch, logs ) => {
console.log( "Epoch #", epoch, logs );
}
}
});
hasTrained = true;
}
const mobilenet = "https://storage.googleapis.com/tfjs-models/tfjs/mobilenet_v1_0.25_224/model.json";
let model = null;
let hasTrained = false;
async function setupWebcam() {
return new Promise( ( resolve, reject ) => {
const webcamElement = document.getElementById( "webcam" );
const navigatorAny = navigator;
navigator.getUserMedia = navigator.getUserMedia ||
navigatorAny.webkitGetUserMedia || navigatorAny.mozGetUserMedia ||
navigatorAny.msGetUserMedia;
if( navigator.getUserMedia ) {
navigator.getUserMedia( { video: true },
stream => {
webcamElement.srcObject = stream;
webcamElement.addEventListener( "loadeddata", resolve, false );
},
error => reject());
}
else {
reject();
}
});
}
// Capture a webcam frame and normalize pixel values to roughly [-1, 1]
async function getWebcamImage() {
const img = ( await webcam.capture() ).toFloat();
const normalized = img.div( 127 ).sub( 1 );
return normalized;
}
async function captureSample( category ) {
trainingData.push( {
image: await getWebcamImage(),
category: category
});
setText( "Captured: " + labels[ category ] );
}
let webcam = null;
(async () => {
model = await tf.loadLayersModel( mobilenet );
model = createTransferModel( model );
await setupWebcam();
webcam = await tf.data.webcam( document.getElementById( "webcam" ) );
setInterval( predictImage, 200 );
})();
</script>
</body>
</html>
Detecting Hand Gestures
The starting point is ready to detect four different categories: None, Rock, Paper, and Scissors. Try it with your webcam by clicking each of the category buttons to capture some photos (5-6 per gesture is a good sample to start with) while holding that hand gesture, and then clicking the Train button to run transfer learning on the neural network. After this, you can improve the model by taking more photos and clicking the Train button again.
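If you would like to check how many samples you have captured for each category before training, a small helper such as the hypothetical countSamples() below (not part of the listing above) can log a per-label tally; it only relies on the trainingData and labels arrays already defined in the starting point.
function countSamples() {
// Tally how many captured samples exist for each category label
const counts = labels.map( ( label, index ) =>
label + ": " + trainingData.filter( sample => sample.category === index ).length
);
console.log( counts.join( ", " ) );
}
You could call it at the end of captureSample() or run it from the browser console while capturing photos.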
Additional Hand Gestures and Sign Language
As you can probably imagine, adding more categories makes the task harder for the AI to learn and training takes more time. However, the results are fun, and the AI performs fairly well even from just a couple of photos per category. Let's try adding some American Sign Language (ASL) gestures.
To add more, include additional buttons in the button list, updating the number passed into captureSample(), and extend the labels array accordingly; the snippet after the list below shows the additions.
You can add whichever signs you would like. I tried adding four that were part of the emoji set:
- 👆 (Letter D)
- 👍 (Thumb Up)
- 🖖 (Vulcan)
- 🤟 (ILY - I Love You)
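With those four signs added, the extra buttons and the expanded labels array look like this (the same additions appear in the full listing in the Finish Line section):
<button onclick="captureSample(4)">👆 (Letter D)</button>
<button onclick="captureSample(5)">👍 (Thumb Up)</button>
<button onclick="captureSample(6)">🖖 (Vulcan)</button>
<button onclick="captureSample(7)">🤟 (ILY - I Love You)</button>
And, in the script:
const labels = [
"None",
"✊ (Rock)",
"🖐 (Paper)",
"✌️ (Scissors)",
"👆 (Letter D)",
"👍 (Thumb Up)",
"🖖 (Vulcan)",
"🤟 (ILY - I Love You)"
];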
Technical Footnotes
- If the AI does not seem to recognize your hand gestures well, try taking more photos and then training the model multiple times.
- While training the model with the various hand gestures, keep in mind that it sees the full image; it doesn't necessarily know that the hand by itself distinguishes the categories. It may be difficult to accurately recognize different hand gestures without numerous samples from different hands.
- Sometimes, the model learns to differentiate between left and right hands, and sometimes it does not, which could affect predictions after multiple rounds of training.
Finish Line
For your reference, here is the full code for this project:
<html>
<head>
<meta charset="UTF-8">
<title>Interpreting Hand Gestures and Sign Language in the Webcam with AI using TensorFlow.js</title>
<script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs@2.0.0/dist/tf.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs-data@2.0.0/dist/tf-data.min.js"></script>
<style>
img, video {
object-fit: cover;
}
</style>
</head>
<body>
<video autoplay playsinline muted id="webcam" width="224" height="224"></video>
<div id="buttons">
<button onclick="captureSample(0)">None</button>
<button onclick="captureSample(1)">β (Rock)</button>
<button onclick="captureSample(2)">π (Paper)</button>
<button onclick="captureSample(3)">βοΈ (Scissors)</button>
<button onclick="captureSample(4)">π (Letter D)</button>
<button onclick="captureSample(5)">π (Thumb Up)</button>
<button onclick="captureSample(6)">π (Vulcan)</button>
<button onclick="captureSample(7)">π€ (ILY - I Love You)</button>
<button onclick="trainModel()">Train</button>
</div>
<h1 id="status">Loading...</h1>
<script>
let trainingData = [];
const labels = [
"None",
"β (Rock)",
"π (Paper)",
"βοΈ (Scissors)",
"π (Letter D)",
"π (Thumb Up)",
"π (Vulcan)",
"π€ (ILY - I Love You)"
];
function setText( text ) {
document.getElementById( "status" ).innerText = text;
}
// Classify the current webcam frame and display the most likely label
async function predictImage() {
if( !hasTrained ) { return; }
const img = await getWebcamImage();
let result = tf.tidy( () => {
const input = img.reshape( [ 1, 224, 224, 3 ] );
return model.predict( input );
});
img.dispose();
let prediction = await result.data();
result.dispose();
let id = prediction.indexOf( Math.max( ...prediction ) );
setText( labels[ id ] );
}
// Cut MobileNet off at its bottleneck layer, freeze the base, and attach a new trainable classification head
function createTransferModel( model ) {
const bottleneck = model.getLayer( "dropout" );
const baseModel = tf.model({
inputs: model.inputs,
outputs: bottleneck.output
});
for( const layer of baseModel.layers ) {
layer.trainable = false;
}
const newHead = tf.sequential();
newHead.add( tf.layers.flatten( {
inputShape: baseModel.outputs[ 0 ].shape.slice( 1 )
} ) );
newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
newHead.add( tf.layers.dense( { units: 10, activation: 'relu' } ) );
newHead.add( tf.layers.dense( {
units: labels.length,
kernelInitializer: 'varianceScaling',
useBias: false,
activation: 'softmax'
} ) );
const newOutput = newHead.apply( baseModel.outputs[ 0 ] );
const newModel = tf.model( { inputs: baseModel.inputs, outputs: newOutput } );
return newModel;
}
// One-hot encode the captured samples and fit the transfer model on them
async function trainModel() {
hasTrained = false;
setText( "Training..." );
const imageSamples = [];
const targetSamples = [];
trainingData.forEach( sample => {
imageSamples.push( sample.image );
let cat = [];
for( let c = 0; c < labels.length; c++ ) {
cat.push( c === sample.category ? 1 : 0 );
}
targetSamples.push( tf.tensor1d( cat ) );
});
const xs = tf.stack( imageSamples );
const ys = tf.stack( targetSamples );
model.compile( { loss: "meanSquaredError", optimizer: "adam", metrics: [ "acc" ] } );
await model.fit( xs, ys, {
epochs: 30,
shuffle: true,
callbacks: {
onEpochEnd: ( epoch, logs ) => {
console.log( "Epoch #", epoch, logs );
}
}
});
hasTrained = true;
}
const mobilenet = "https://storage.googleapis.com/tfjs-models/tfjs/mobilenet_v1_0.25_224/model.json";
let model = null;
let hasTrained = false;
async function setupWebcam() {
return new Promise( ( resolve, reject ) => {
const webcamElement = document.getElementById( "webcam" );
const navigatorAny = navigator;
navigator.getUserMedia = navigator.getUserMedia ||
navigatorAny.webkitGetUserMedia || navigatorAny.mozGetUserMedia ||
navigatorAny.msGetUserMedia;
if( navigator.getUserMedia ) {
navigator.getUserMedia( { video: true },
stream => {
webcamElement.srcObject = stream;
webcamElement.addEventListener( "loadeddata", resolve, false );
},
error => reject());
}
else {
reject();
}
});
}
// Capture a webcam frame and normalize pixel values to roughly [-1, 1]
async function getWebcamImage() {
const img = ( await webcam.capture() ).toFloat();
const normalized = img.div( 127 ).sub( 1 );
return normalized;
}
async function captureSample( category ) {
trainingData.push( {
image: await getWebcamImage(),
category: category
});
setText( "Captured: " + labels[ category ] );
}
let webcam = null;
(async () => {
model = await tf.loadLayersModel( mobilenet );
model = createTransferModel( model );
await setupWebcam();
webcam = await tf.data.webcam( document.getElementById( "webcam" ) );
setInterval( predictImage, 200 );
})();
</script>
</body>
</html>
What's Next?
This project showed you how to start training your own computer vision AI to recognize potentially unlimited gestures, objects, species of animals, or even types of foods. The rest is up to you; the future of deep learning and AI might start right within your browser.
I hope you enjoyed following along with these examples. And as you experiment with more ideas, donβt forget to have fun!
Raphael Mun is a tech entrepreneur and educator who has been developing software professionally for over 20 years. He currently runs Lemmino, Inc. and teaches and entertains through his Instafluff livestreams on Twitch, building open source projects with his community.