could not extract href data #366
Comments
Thanks for using Crawl4ai. You mentioned something that is correct: when you want to extract attributes from the base element, the schema is created a little differently. I edited your code and pasted it here. Also, please wait for the new version 0.4.24, which I will release in a day or two; there have been some changes and, until they are tested, things may not work properly. When you define a schema and want to take attributes from the base element, you must pass baseFields, a list of the attributes you want to extract. Look at the following code to see how it works:

schema = {
"name": "Cursor Blog Posts",
"baseSelector": "a.relative.justify-between",
"baseFields": [
{"name": "href", "type": "attribute", "attribute": "href"},
],
"fields": [
]
}

By the way, there is currently no "excludeFields" option; the extraction strategy extracts only the fields you define. I hope this helps.
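For a self-contained illustration, here is a minimal sketch of the same baseFields idea run on its own (the demo() name and the "Cursor Blog Links" schema name are mine for illustration; the selectors, URL, and API calls are the ones already used in your script, and the items returned depend on the live page):

import asyncio
import json

from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def demo():
    # baseFields are read from each matched base element itself (here, the <a> tag),
    # so every extracted item carries its own "href".
    schema = {
        "name": "Cursor Blog Links",
        "baseSelector": "a.relative.justify-between",
        "baseFields": [
            {"name": "href", "type": "attribute", "attribute": "href"},
        ],
        "fields": [],
    }
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.cursor.com/blog",
            extraction_strategy=JsonCssExtractionStrategy(schema=schema),
            cache_mode=CacheMode.DISABLED,
        )
    if result.success:
        for item in json.loads(result.extracted_content):
            print(item.get("href"))

asyncio.run(demo())

The full edited script follows.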
import asyncio
import json
import os
from datetime import datetime
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from pydantic import BaseModel, Field
from typing import List, Optional
class BlogPost(BaseModel):
    href: Optional[str] = Field(None, description="Post URL")
    title: str = Field(..., description="Post title")
    date: str = Field(..., description="Publication date")
    summarize: Optional[str] = Field(None, description="Post summary")
    author: Optional[str] = Field(None, description="Author")
def ensure_dir(directory):
"""确保目录存在,如果不存在则创建"""
if not os.path.exists(directory):
os.makedirs(directory)
def save_results(posts_data, directory="cursor_blog"):
"""保存提取结果到json文件"""
ensure_dir(directory)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"{directory}/posts_data_{timestamp}.json"
with open(filename, "w", encoding="utf-8") as f:
json.dump(posts_data, f, ensure_ascii=False, indent=2)
return filename
async def main():
try:
print("🌟 [bold cyan]开始提取Cursor博客文章...[/bold cyan]")
# 定义通用的爬取配置
wait_for_config = """() => {
const footer = document.querySelector('footer.container-full');
const articles = document.querySelectorAll("a[href^='/blog/']:not([href='/blog'])");
return footer && articles.length > 0;
}"""
js_code_config = """
(async () => {
await new Promise(r => setTimeout(r, 1000));
window.scrollTo(0, document.body.scrollHeight);
await new Promise(r => setTimeout(r, 1000));
})();
"""
        # Define the extraction schema
schema = {
"name": "Cursor Blog Posts",
"baseSelector": "a.relative.justify-between",
"baseFields": [
{"name": "href", "type": "attribute", "attribute": "href"},
],
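            # fields use selectors evaluated inside each matched base element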
"fields": [
{
"name": "title",
"selector": "h2",
"type": "text",
"default": "无标题",
},
{"name": "date", "selector": "time", "type": "text", "default": ""},
{
"name": "summarize",
"selector": "p.hidden.text-brand-neutrals-600, p.text-brand-neutrals-600",
"type": "text",
"default": "",
},
{
"name": "author",
"selector": "p.text-brand-gray-800",
"type": "text",
"transform": "lambda x: x.replace('By', '') if x else ''",
"default": "",
},
],
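            # note: "excludeFields" below is not a real schema option and has no effect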
"excludeFields": ["time_read"], # 明确排除time_read字段
}
extraction_strategy = JsonCssExtractionStrategy(schema=schema)
async with AsyncWebCrawler(verbose=True) as crawler:
print("⏳ [yellow]正在爬取页面...[/yellow]")
# 使用提取策略爬取
result = await crawler.arun(
url="https://www.cursor.com/blog",
extraction_strategy=extraction_strategy,
                cache_mode=CacheMode.DISABLED,  # disable caching
wait_for=wait_for_config,
js_code=js_code_config,
exclude_external_links=True,
exclude_social_media_links=True,
)
if result.success:
try:
                    # Parse the JSON string into Python objects
posts_data = json.loads(result.extracted_content)
print("posts_data", posts_data)
if posts_data:
print(
f"✅ [bold green]成功提取 {len(posts_data)} 篇文章![/bold green]"
)
filename = save_results(posts_data)
print(f"💾 [bold green]结果已保存到: {filename}[/bold green]")
else:
print("[bold red]警告: 没有提取到有效的文章数据[/bold red]")
except json.JSONDecodeError as e:
print(f"❌ [bold red]JSON解析错误: {str(e)}[/bold red]")
else:
print(f"❌ [bold red]提取失败: {result.error_message}[/bold red]")
except Exception as e:
print(f"❌ [bold red]发生错误: {str(e)}[/bold red]")
if __name__ == "__main__":
    asyncio.run(main())

Output:
The above code successfully extracts the data, but the href field cannot be extracted.
I know the hrefs could be taken from the internal links, but that returns far too many; I just want the blog post hrefs, and I want each one to correspond to its blog post metadata.
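With baseFields in the schema, each item in the extracted list should carry its href alongside the rest of the post metadata. As a quick check (a sketch only; the glob pattern matches the filenames written by save_results above), you could load the saved JSON and print the pairs:

import glob
import json

# Load the most recently saved results file (filename pattern from save_results).
latest = sorted(glob.glob("cursor_blog/posts_data_*.json"))[-1]
with open(latest, encoding="utf-8") as f:
    posts = json.load(f)

for post in posts:
    # "href" comes from baseFields; the other keys come from the "fields" selectors.
    print(post.get("href"), "|", post.get("title"), "|", post.get("date"))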