python数据提取之re、jsonpath模块

一、re模块--方法

1、匹配指定的字符串，返回所有找到的数据

# findall: collect every occurrence of the pattern into a list.
result = re.findall(r"bai", test_str)

2、匹配字符串开始的位置，返回的是match对象，需要通过group()函数处理

# match: only tests the start of the string; group() extracts the matched text.
start_match = re.match(r"w", test_str)
result = start_match.group()

3、匹配所有的字符串，返回找到的第一个值，需要通过group()函数处理

# search: scans the string and stops at the first hit; group() extracts the matched text.
first_hit = re.search(r"c", test_str)
result = first_hit.group()

4、匹配所有的字符串，返回的是一个迭代器

# finditer: yields one match object per occurrence instead of building a list.
result = re.finditer(r"w", test_str)

5、代码演示

import re

# Every call below takes (pattern, string): the regular expression first,
# then the text to search.
test_str = "www.baidu.com"

# findall: all occurrences, returned as a list.
result = re.findall(r"bai", test_str)
print(result)

# match: anchored at the start of the string; group() yields the matched text.
start_match = re.match(r"w", test_str)
result = start_match.group()
print(result)

# search: first occurrence anywhere in the string.
first_hit = re.search(r"c", test_str)
result = first_hit.group()
print(result)

# finditer: iterate over match objects one at a time.
result = re.finditer(r"w", test_str)
for found in result:
    print(found.group())

二、单字符匹配【元字符】

1、.占位匹配任意字符

# "." stands in for any single character following the "h".
test_str = "hello python"
any_after_h = re.compile(r"h.")
result = any_after_h.findall(test_str)
print(result)  # Output: ['he', 'ho']

2、[ ]：匹配这个[ ]内的任意字符

# A character class matches any one of the characters listed inside [ ].
test_str = "hello python"
one_of_plh = re.compile(r"[plh]")
result = one_of_plh.findall(test_str)
print(result)  # Output: ['h', 'l', 'l', 'p', 'h']

3、\d 匹配数字 0---9

# \d matches a single digit 0-9. The backslash is required: a bare "d" would
# only match the literal letter d. A raw string keeps the escape intact.
test_str = "hello999python666"
result = re.findall(r"\d", test_str)
print(result)  # Output: ['9', '9', '9', '6', '6', '6']

4、\D：匹配非数字

# \D matches any character that is not a digit. The backslash is required:
# a bare "D" would only match the literal letter D.
test_str = "hello999python666"
result = re.findall(r"\D", test_str)
print(result)  # Output: ['h', 'e', 'l', 'l', 'o', 'p', 'y', 't', 'h', 'o', 'n']

5、\s：匹配空白字符，如 tab 和空格

# \s matches a whitespace character (space, tab, ...). The backslash is
# required: a bare "s" would only match the literal letter s.
test_str = "hel lo pyt hon"
result = re.findall(r"\s", test_str)
print(result)  # Output: [' ', ' ', ' ']

6、\S：匹配非空白

# \S matches any non-whitespace character. The backslash is required:
# a bare "S" would only match the literal letter S.
test_str = "hello python"
result = re.findall(r"\S", test_str)
print(result)  # Output: ['h', 'e', 'l', 'l', 'o', 'p', 'y', 't', 'h', 'o', 'n']

7、\w：匹配非特殊字符，即字母、数字和下划线（_ 也属于非特殊字符）

# \w matches a "word" character: letters, digits and the underscore.
# The backslash is required: a bare "w" would only match the literal letter w.
test_str = "!@#$$$%#@#_++++__python"
result = re.findall(r"\w", test_str)
print(result)  # Output: ['_', '_', '_', 'p', 'y', 't', 'h', 'o', 'n']

8、\W：匹配特殊字符（非字母、数字、下划线的字符），空格和 tab 也是特殊字符（每个空白字符各算一个，示例中的三个连续空格算三个）

# \W matches any character that is not a letter, digit or underscore;
# each space counts as one such character. The backslash is required:
# a bare "W" would only match the literal letter W.
test_str = "!@#$%_+_pyt h   on"
result = re.findall(r"\W", test_str)
print(result)  # Output: ['!', '@', '#', '$', '%', '+', ' ', ' ', ' ', ' ']

三、多字符匹配

1、*——匹配前一个字符出现0次或者无限次

# "n*" may match zero characters, so every position that is not an "n"
# contributes an empty string to the result.
test_str = "hello python"
zero_or_more_n = re.compile(r"n*")
result = zero_or_more_n.findall(test_str)
print(result)  # Output: ['', '', '', '', '', '', '', '', '', '', '', 'n', '']

2、+——匹配前一个字符出现1次或者无限次，至少能匹配到一次，贪婪模式

# "gooo+" needs "goo" followed by at least one more "o", so only the last word qualifies.
test_str = "goo goo goooo"
three_or_more_o = re.compile(r"gooo+")
result = three_or_more_o.findall(test_str)
print(result)  # Output: ['goooo']

3、?——匹配前一个字符出现0次或者1次，最多匹配1次（? 本身是贪婪的；只有跟在 *、+、{m,n} 等量词之后时才表示非贪婪模式，如 .*?）

# "go?" takes a "g" plus at most one "o", so each word contributes 'go'.
test_str = "goo go goooo"
g_optional_o = re.compile(r"go?")
result = g_optional_o.findall(test_str)
print(result)  # Output: ['go', 'go', 'go']

4、{n}——匹配前一个字符连续出现n次

# "go{2}" requires exactly two consecutive "o" after the "g".
test_str = "goo go goooo"
g_two_o = re.compile(r"go{2}")
result = g_two_o.findall(test_str)
print(result)  # Output: ['goo', 'goo']

5、{m,n}——匹配前一个字符连续出现m到n次

# "go{3,4}" accepts three to four consecutive "o" after the "g".
test_str = "gooo go goooo"
g_three_to_four_o = re.compile(r"go{3,4}")
result = g_three_to_four_o.findall(test_str)
print(result)  # Output: ['gooo', 'goooo']

6、代码演示--最常用的方法

# Typical use: parameter substitution with the pattern  #(\w.*?)#
# ( ) is a capture group: wrap the part you want in parentheses and findall
# returns only that part of each match.
# \w needs its backslash (a bare "w" would demand a literal letter w after
# the "#"), and .*? is non-greedy so each match stops at the nearest "#".
test_str = '{"key1":"#val1#","key2":"#val2#"}'
result = re.findall(r"#(\w.*?)#", test_str)  # ( ) -- capture group
print(result)     # Output: ['val1', 'val2']

四、逻辑运算符 | 或

# "|" tries the left alternative, then the right one; only "py" occurs in the text.
test_str = "hello python"
hn_or_py = re.compile(r"hn|py")
result = hn_or_py.findall(test_str)
print(result)  # Output: ['py']

五、边界值

1、^ 匹配字符串开始的位置

# "^" pins the match to the beginning of the string.
test_str = "hello python"
starts_with_he = re.compile(r"^he")
result = starts_with_he.findall(test_str)
print(result)  # Output: ['he']

2、$ 匹配字符串结束位置

# "$" pins the match to the end of the string.
test_str = "hello python"
ends_with_on = re.compile(r"on$")
result = ends_with_on.findall(test_str)
print(result)  # Output: ['on']

六、jsonpath

1、安装jsonpath

pip install jsonpath

2、注意事项

$:表示根元素

.:表示子元素

[]:表示子元素

..:递归查找

条件查找：?(@.name=='小简') > < >= <= ==

3、代码演示

# pprint is used throughout this demo, so it has to be imported here;
# without it the first pprint.pprint call raises NameError.
import pprint

from jsonpath import jsonpath

# Sample data: teacher records grouped by course under the "lemon" key.
teacher_info = {
    "lemon": {
        "python": [
            {"name": "Joseph Reed", "age": 20, "height": 180},
            {"name": "Charles Malone", "age": 50, "height": 185},
            {"name": "Jason Nguyen", "age": 20, "height": 175},
            {"name": "Beverly Hoffman", "age": 30, "height": 167},
            {"name": "Linda Cooper", "age": 30, "height": 167},
        ],
        "java": [
            {"name": "Tiffany Anderson", "age": 30, "height": 180},
            {"name": "Kevin Garza", "age": 33, "height": 185},
            {"name": "Jacob Brown", "age": 28, "height": 170},
        ],
    }
}

# Child elements via "." or "[]", starting from the root element "$".
res = jsonpath(teacher_info, "$.lemon.python")
res1 = jsonpath(teacher_info, "$[lemon][java]")
pprint.pprint(res)
pprint.pprint(res1)

# Recursive lookup with "..".
res2 = jsonpath(teacher_info, "$..java")
pprint.pprint(res2)

# Select specific attributes.
res3 = jsonpath(teacher_info, "$..java.[name,age]")
# Select items by index.
res4 = jsonpath(teacher_info, "$..java.[0,2]")
pprint.pprint(res3)  # Output: ['Tiffany Anderson', 30, 'Kevin Garza', 33, 'Jacob Brown', 28]
pprint.pprint(res4)  # Output: [{'age': 30, 'height': 180, 'name': 'Tiffany Anderson'},
                                # {'age': 28, 'height': 170, 'name': 'Jacob Brown'}]


# Filter on a single condition.
res5 = jsonpath(teacher_info, "$..python.[?(@.name=='Joseph Reed')]")
pprint.pprint(res5)

# Filter on multiple conditions: and / or / && / ||
res6 = jsonpath(teacher_info, "$..python.[?(@.age<30 && @.height>170)]")
pprint.pprint(res6)

# Membership filters: not in / in
res7 = jsonpath(teacher_info, "$..java.[?(@.name in ['Kevin Garza'])]")
print(res7)

python数据提取之re、jsonpath模块

Python相关栏目本月热门文章