Add new models, datasets, videos to task pages (#836)

merveenoyan · pcuenca · web-flow · commit 099a25ee61a0 · 2024-08-08T12:36:13.000+03:00
---------

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
diff --git a/packages/tasks/src/tasks/feature-extraction/data.ts b/packages/tasks/src/tasks/feature-extraction/data.ts
@@ -42,9 +42,13 @@ const taskData: TaskDataCustom = {
 	],
 	spaces: [
 		{
-			description: "A leaderboard to rank best feature extraction models..",
+			description: "A leaderboard to rank text feature extraction models based on a benchmark.",
 			id: "mteb/leaderboard",
 		},
+		{
+			description: "A leaderboard to rank best feature extraction models based on human feedback.",
+			id: "mteb/arena",
+		},
 	],
 	summary: "Feature extraction is the task of extracting features learnt in a model.",
 	widgetModels: ["facebook/bart-base"],
diff --git a/packages/tasks/src/tasks/image-feature-extraction/data.ts b/packages/tasks/src/tasks/image-feature-extraction/data.ts
@@ -36,16 +36,20 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A strong image feature extraction model.",
-			id: "google/vit-base-patch16-224-in21k",
+			id: "nvidia/MambaVision-T-1K",
 		},
 		{
-			description: "A robust image feature extraction models.",
+			description: "A robust image feature extraction model.",
 			id: "facebook/dino-vitb16",
 		},
 		{
-			description: "Strong image-text-to-text model made for information retrieval from documents.",
+			description: "Strong image feature extraction model made for information retrieval from documents.",
 			id: "vidore/colpali",
 		},
+		{
+			description: "Strong image feature extraction model that can be used on images and documents.",
+			id: "OpenGVLab/InternViT-6B-448px-V1-2",
+		},
 	],
 	spaces: [],
 	summary: "Image feature extraction is the task of extracting features learnt in a computer vision model.",
diff --git a/packages/tasks/src/tasks/image-segmentation/data.ts b/packages/tasks/src/tasks/image-segmentation/data.ts
@@ -48,16 +48,16 @@ const taskData: TaskDataCustom = {
 			id: "facebook/detr-resnet-50-panoptic",
 		},
 		{
-			description: "Semantic segmentation model trained on ADE20k benchmark dataset.",
-			id: "microsoft/beit-large-finetuned-ade-640-640",
+			description: "Background removal model.",
+			id: "briaai/RMBG-1.4",
 		},
 		{
 			description: "Semantic segmentation model trained on ADE20k benchmark dataset with 512x512 resolution.",
 			id: "nvidia/segformer-b0-finetuned-ade-512-512",
 		},
 		{
-			description: "Semantic segmentation model trained Cityscapes dataset.",
-			id: "facebook/mask2former-swin-large-cityscapes-semantic",
+			description: "A multipurpose image segmentation model for high resolution images.",
+			id: "ZhengPeng7/BiRefNet",
 		},
 		{
 			description: "Panoptic segmentation model trained COCO (common objects) dataset.",
diff --git a/packages/tasks/src/tasks/image-text-to-text/about.md b/packages/tasks/src/tasks/image-text-to-text/about.md
@@ -72,3 +72,5 @@ print(processor.decode(output[0], skip_special_tokens=True))
 - [Vision Language Models Explained](https://huggingface.co/blog/vlms)
 - [Open-source Multimodality and How to Achieve it using Hugging Face](https://www.youtube.com/watch?v=IoGaGfU1CIg&t=601s)
 - [Introducing Idefics2: A Powerful 8B Vision-Language Model for the community](https://huggingface.co/blog/idefics2)
+- [Image-text-to-text task guide](https://huggingface.co/tasks/image-text-to-text)
+- [Preference Optimization for Vision Language Models with TRL](https://huggingface.co/blog/dpo_vlm)
diff --git a/packages/tasks/src/tasks/image-text-to-text/data.ts b/packages/tasks/src/tasks/image-text-to-text/data.ts
@@ -88,7 +88,7 @@ const taskData: TaskDataCustom = {
 	summary:
 		"Image-text-to-text models take in an image and text prompt and output text. These models are also called vision-language models, or VLMs. The difference from image-to-text models is that these models take an additional text input, not restricting the model to certain use cases like image captioning, and may also be trained to accept a conversation as input.",
 	widgetModels: ["microsoft/kosmos-2-patch14-224"],
-	youtubeId: "",
+	youtubeId: "IoGaGfU1CIg",
 };
 
 export default taskData;
diff --git a/packages/tasks/src/tasks/image-to-image/data.ts b/packages/tasks/src/tasks/image-to-image/data.ts
@@ -45,8 +45,8 @@ const taskData: TaskDataCustom = {
 	],
 	models: [
 		{
-			description: "A model that enhances images captured in low light conditions.",
-			id: "keras-io/low-light-image-enhancement",
+			description: "An image-to-image model to improve image resolution.",
+			id: "fal/AuraSR-v2",
 		},
 		{
 			description: "A model that increases the resolution of an image.",
diff --git a/packages/tasks/src/tasks/mask-generation/data.ts b/packages/tasks/src/tasks/mask-generation/data.ts
@@ -24,22 +24,22 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "Very strong mask generation model.",
-			id: "facebook/sam-vit-huge",
+			id: "facebook/sam2-hiera-large",
 		},
 	],
 	spaces: [
 		{
 			description:
-				"An application that combines a mask generation model with an image embedding model for open-vocabulary image segmentation.",
-			id: "SkalskiP/SAM_and_MetaCLIP",
+				"An application that combines a mask generation model with a zero-shot object detection model for text-guided image segmentation.",
+			id: "merve/OWLSAM2",
 		},
 		{
 			description: "An application that compares the performance of a large and a small mask generation model.",
 			id: "merve/slimsam",
 		},
 		{
 			description: "An application based on an improved mask generation model.",
-			id: "linfanluntan/Grounded-SAM",
+			id: "SkalskiP/segment-anything-model-2",
 		},
 		{
 			description: "An application to remove objects from videos using mask generation models.",
diff --git a/packages/tasks/src/tasks/text-generation/data.ts b/packages/tasks/src/tasks/text-generation/data.ts
@@ -19,6 +19,10 @@ const taskData: TaskDataCustom = {
 			description: "An instruction dataset with preference ratings on responses.",
 			id: "openbmb/UltraFeedback",
 		},
+		{
+			description: "A large synthetic dataset for alignment of text generation models.",
+			id: "argilla/magpie-ultra-v0.1",
+		},
 	],
 	demo: {
 		inputs: [
@@ -51,32 +55,32 @@ const taskData: TaskDataCustom = {
 	],
 	models: [
 		{
-			description: "A large language model trained for text generation.",
-			id: "bigscience/bloom-560m",
+			description: "A text-generation model trained to follow instructions.",
+			id: "google/gemma-2-2b-it",
 		},
 		{
-			description: "A large code generation model that can generate code in 80+ languages.",
+			description: "A code generation model that can generate code in 80+ languages.",
 			id: "bigcode/starcoder",
 		},
 		{
-			description: "A very powerful text generation model.",
-			id: "mistralai/Mixtral-8x7B-Instruct-v0.1",
+			description: "Very powerful text generation model trained to follow instructions.",
+			id: "meta-llama/Meta-Llama-3.1-8B-Instruct",
 		},
 		{
 			description: "Small yet powerful text generation model.",
-			id: "microsoft/phi-2",
+			id: "microsoft/Phi-3-mini-4k-instruct",
 		},
 		{
-			description: "A very powerful model that can chat, do mathematical reasoning and write code.",
-			id: "openchat/openchat-3.5-0106",
+			description: "A very powerful model that can solve mathematical problems.",
+			id: "AI-MO/NuminaMath-7B-TIR",
 		},
 		{
-			description: "Very strong yet small assistant model.",
-			id: "HuggingFaceH4/zephyr-7b-beta",
+			description: "Strong coding assistant model.",
+			id: "HuggingFaceH4/starchat2-15b-v0.1",
 		},
 		{
 			description: "Very strong open-source large language model.",
-			id: "meta-llama/Llama-2-70b-hf",
+			id: "mistralai/Mistral-Nemo-Instruct-2407",
 		},
 	],
 	spaces: [
@@ -104,7 +108,7 @@ const taskData: TaskDataCustom = {
 	summary:
 		"Generating text is the task of generating new text given another text. These models can, for example, fill in incomplete text or paraphrase.",
 	widgetModels: ["HuggingFaceH4/zephyr-7b-beta"],
-	youtubeId: "Vpjb1lu0MDk",
+	youtubeId: "e9gNEAlsOvU",
 };
 
 export default taskData;
diff --git a/packages/tasks/src/tasks/text-to-image/data.ts b/packages/tasks/src/tasks/text-to-image/data.ts
@@ -46,15 +46,15 @@ const taskData: TaskDataCustom = {
 	models: [
 		{
 			description: "One of the most powerful image generation models that can generate realistic outputs.",
-			id: "stabilityai/stable-diffusion-xl-base-1.0",
+			id: "black-forest-labs/FLUX.1-dev",
 		},
 		{
 			description: "A powerful yet fast image generation model.",
 			id: "latent-consistency/lcm-lora-sdxl",
 		},
 		{
-			description: "A very fast text-to-image model.",
-			id: "ByteDance/SDXL-Lightning",
+			description: "Text-to-image model for photorealistic generation.",
+			id: "Kwai-Kolors/Kolors",
 		},
 		{
 			description: "A powerful text-to-image model.",

Original file line number	Diff line number	Diff line change
`@@ -48,16 +48,16 @@ const taskData: TaskDataCustom = {`
`48`	`48`	`id: "facebook/detr-resnet-50-panoptic",`
`49`	`49`	`},`
`50`	`50`	`{`
`51`		`- description: "Semantic segmentation model trained on ADE20k benchmark dataset.",`
`52`		`- id: "microsoft/beit-large-finetuned-ade-640-640",`
	`51`	`+ description: "Background removal model.",`
	`52`	`+ id: "briaai/RMBG-1.4",`
`53`	`53`	`},`
`54`	`54`	`{`
`55`	`55`	`description: "Semantic segmentation model trained on ADE20k benchmark dataset with 512x512 resolution.",`
`56`	`56`	`id: "nvidia/segformer-b0-finetuned-ade-512-512",`
`57`	`57`	`},`
`58`	`58`	`{`
`59`		`- description: "Semantic segmentation model trained Cityscapes dataset.",`
`60`		`- id: "facebook/mask2former-swin-large-cityscapes-semantic",`
	`59`	`+ description: "A multipurpose image segmentation model for high resolution images.",`
	`60`	`+ id: "ZhengPeng7/BiRefNet",`
`61`	`61`	`},`
`62`	`62`	`{`
`63`	`63`	`description: "Panoptic segmentation model trained COCO (common objects) dataset.",`
Original file line number	Diff line number	Diff line change
`@@ -45,8 +45,8 @@ const taskData: TaskDataCustom = {`
`45`	`45`	`],`
`46`	`46`	`models: [`
`47`	`47`	`{`
`48`		`- description: "A model that enhances images captured in low light conditions.",`
`49`		`- id: "keras-io/low-light-image-enhancement",`
	`48`	`+ description: "An image-to-image model to improve image resolution.",`
	`49`	`+ id: "fal/AuraSR-v2",`
`50`	`50`	`},`
`51`	`51`	`{`
`52`	`52`	`description: "A model that increases the resolution of an image.",`
Original file line number	Diff line number	Diff line change
`@@ -24,22 +24,22 @@ const taskData: TaskDataCustom = {`
`24`	`24`	`},`
`25`	`25`	`{`
`26`	`26`	`description: "Very strong mask generation model.",`
`27`		`- id: "facebook/sam-vit-huge",`
	`27`	`+ id: "facebook/sam2-hiera-large",`
`28`	`28`	`},`
`29`	`29`	`],`
`30`	`30`	`spaces: [`
`31`	`31`	`{`
`32`	`32`	`description:`
`33`		`- "An application that combines a mask generation model with an image embedding model for open-vocabulary image segmentation.",`
`34`		`- id: "SkalskiP/SAM_and_MetaCLIP",`
	`33`	`+ "An application that combines a mask generation model with a zero-shot object detection model for text-guided image segmentation.",`
	`34`	`+ id: "merve/OWLSAM2",`
`35`	`35`	`},`
`36`	`36`	`{`
`37`	`37`	`description: "An application that compares the performance of a large and a small mask generation model.",`
`38`	`38`	`id: "merve/slimsam",`
`39`	`39`	`},`
`40`	`40`	`{`
`41`	`41`	`description: "An application based on an improved mask generation model.",`
`42`		`- id: "linfanluntan/Grounded-SAM",`
	`42`	`+ id: "SkalskiP/segment-anything-model-2",`
`43`	`43`	`},`
`44`	`44`	`{`
`45`	`45`	`description: "An application to remove objects from videos using mask generation models.",`
Original file line number	Diff line number	Diff line change
`@@ -46,15 +46,15 @@ const taskData: TaskDataCustom = {`
`46`	`46`	`models: [`
`47`	`47`	`{`
`48`	`48`	`description: "One of the most powerful image generation models that can generate realistic outputs.",`
`49`		`- id: "stabilityai/stable-diffusion-xl-base-1.0",`
	`49`	`+ id: "black-forest-labs/FLUX.1-dev",`
`50`	`50`	`},`
`51`	`51`	`{`
`52`	`52`	`description: "A powerful yet fast image generation model.",`
`53`	`53`	`id: "latent-consistency/lcm-lora-sdxl",`
`54`	`54`	`},`
`55`	`55`	`{`
`56`		`- description: "A very fast text-to-image model.",`
`57`		`- id: "ByteDance/SDXL-Lightning",`
	`56`	`+ description: "Text-to-image model for photorealistic generation.",`
	`57`	`+ id: "Kwai-Kolors/Kolors",`
`58`	`58`	`},`
`59`	`59`	`{`
`60`	`60`	`description: "A powerful text-to-image model.",`